In [2]:
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score  # Import accuracy_score

# Read data using pandas (replace with your actual file path)
data = pd.read_csv("labelled_train_set.csv")

# Assuming the last column is your target variable
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Convert 'object' type columns to numerical or categorical
# You'll need to decide how to handle the 'ID' and 'News/Comment' columns
# For example, if 'ID' is not a predictive feature, you might drop it:
X = X.drop('ID', axis=1)

# If 'News/Comment' contains text, you might need to use text vectorization techniques
# Here's a simple example using one-hot encoding if it's categorical:
X = pd.get_dummies(X, columns=['News/Comment'])

# Convert boolean or string labels to numerical (0 and 1)
# Handle all non-binary values appropriately
# Check for unique values in your target variable to ensure you've covered all cases
print(y.unique())  # Print unique values to identify any remaining non-binary labels

# Map all non-TRUE values to 0, including 'MOSTLY TRUE'
y = y.replace({'TRUE': 1, 'FALSE': 0, 'MOSTLY FALSE': 0, 'PARTLY FALSE': 0, 'HALF TRUE': 1, 'MOSTLY TRUE': 1})

# Split data into train and test sets
train_X, test_X, train_Y, test_Y = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert data to DMatrix format
dtrain = xgb.DMatrix(train_X, label=train_Y)
dtest = xgb.DMatrix(test_X, label=test_Y)

# Fit the model
num_round = 2
param = {"objective": "binary:logistic", "eta": 1, "max_depth": 2}
bst = xgb.train(param, dtrain, num_round)

# Make predictions
pred = bst.predict(dtest)

# Convert predictions to binary (0 or 1) based on a threshold (usually 0.5)
pred_binary = (pred > 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(test_Y, pred_binary)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

# Function to predict the type of new data (assuming you have a tokenizer and pad_sequences defined elsewhere)
def predict_type(text):
    text_seq = tokenizer.texts_to_sequences([text])
    text_pad = pad_sequences(text_seq, maxlen=maxlen)
    # Use bst to make predictions
    prediction = bst.predict(xgb.DMatrix(text_pad))
    return 'TRUE' if prediction[0] > 0.5 else 'FALSE'

['FALSE' 'HALF TRUE' 'MOSTLY FALSE' 'PARTLY FALSE' 'MOSTLY TRUE']
Test Accuracy: 92.86%


In [3]:
from sklearn.model_selection import cross_val_score
import xgboost as xgb

# Define the parameters properly as a dictionary
param = {'objective': 'binary:logistic', 'eta': 1, 'max_depth': 2, 'eval_metric': 'auc'}

# Example with 5-fold cross-validation
scores = cross_val_score(xgb.XGBClassifier(**param), X, y, cv=5, scoring='accuracy') # Use ** to unpack the dictionary
print("Cross-validation scores:", scores)

Cross-validation scores: [0.90909091 0.90909091 0.91205212 0.90879479 0.90879479]


In [4]:
# Assuming 'X' is the DataFrame you want to use for testing
# and it has already been preprocessed in the same way (e.g., one-hot encoding)

# Convert test data to DMatrix format
dtest_new = xgb.DMatrix(X)  # Use 'X' instead of 'test_data'

# Make predictions on the new test data
predictions = bst.predict(dtest_new)

# Convert predictions to binary labels
predicted_labels = (predictions > 0.5).astype(int)

# Add the predicted labels to the X DataFrame
X['Predicted_Label'] = predicted_labels  # Use 'X' here as well

# If you want to map the numerical labels back to 'TRUE' and 'FALSE':
X['Predicted_Label'] = X['Predicted_Label'].map({1: 'TRUE', 0: 'FALSE'})

# Now you have the predicted labels in the 'Predicted_Label' column of your X DataFrame
print(X.head())

   News/Comment_"Sorry" എന്ന ഇന്ത്യൻ ഷോട്ട് ഫിലിമിന് ഓസ്കാർ ലഭിച്ചു.  \
0                                              False                   
1                                              False                   
2                                              False                   
3                                              False                   
4                                              False                   

   News/Comment_"അയ്യപ്പ വിശ്വാസികളുടെ വോട്ട് സി.പി.എമ്മിന് വേണ്ട. കമ്മ്യൂണിസ്റ്റ് പാർട്ടി തരംതാണിട്ടില്ല, " മുഖ്യമന്ത്രി പിണറായി വിജയൻ പറഞ്ഞു  \
0                                              False                                                                                             
1                                              False                                                                                             
2                                              False                                                                             

In [None]:
# prompt: this is my model, i want to run my test data on this, give me the code for it

# Assuming 'test_data' is a DataFrame containing your test features, preprocessed in the same way as your training data
dtest_new = xgb.DMatrix(test_data)

# Make predictions on the new test data
predictions = bst.predict(dtest_new)

# Convert predictions to binary labels (0 or 1) based on a threshold (usually 0.5)
predicted_labels = (predictions > 0.5).astype(int)

# If you want to map the numerical labels back to 'TRUE' and 'FALSE':
predicted_labels_text = ['TRUE' if label == 1 else 'FALSE' for label in predicted_labels]

# Print or store the predicted labels
print(predicted_labels_text)


In [5]:
import xgboost as xgb
import pandas as pd

# Load the unlabelled test data
test_data = pd.read_csv('unlabelled_test1.csv')

# Preprocess the test data similarly as done with the training data
# Assuming 'ID' is also in the test set, drop it
X_test = test_data.drop('ID', axis=1)

# Convert 'News/Comment' using the same method as in training
X_test = pd.get_dummies(X_test, columns=['News/Comment'])

# Ensure the test data has the same columns as the training data
missing_cols = set(train_X.columns) - set(X_test.columns)
for col in missing_cols:
    X_test[col] = 0
X_test = X_test[train_X.columns]

# Convert the test data to DMatrix format
dtest = xgb.DMatrix(X_test)

# Make predictions using the trained model
pred = bst.predict(dtest)

# Convert predictions to binary (0 or 1) based on a threshold (usually 0.5)
pred_binary = (pred > 0.5).astype(int)

# Convert binary predictions back to labels
pred_labels = ['TRUE' if p == 1 else 'FALSE' for p in pred_binary]

# Add predictions to the original test data
test_data['Prediction'] = pred_labels

# Save the predictions to a new CSV file
test_data.to_csv('unlabelled_test1_with_predictions.csv', index=False)

print("Predictions saved to unlabelled_test1_with_predictions.csv")


  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[c

Predictions saved to unlabelled_test1_with_predictions.csv


In [10]:
import xgboost as xgb
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the labelled training data
train_data = pd.read_csv("labelled_train_set.csv")

# Preprocess training data
X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]

# Drop 'ID' from features
X_train = X_train.drop('ID', axis=1)

# Convert 'News/Comment' to dummies
X_train = pd.get_dummies(X_train, columns=['News/Comment'])

# Convert target labels to binary
y_train = y_train.replace({'TRUE': 1, 'FALSE': 0, 'MOSTLY FALSE': 0, 'PARTLY FALSE': 0, 'HALF TRUE': 1, 'MOSTLY TRUE': 1})

# **Convert boolean columns to integers before applying SMOTE**
for column in X_train.columns:
    if X_train[column].dtype == bool:
        X_train[column] = X_train[column].astype(int)

# Apply SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Split the resampled data into train and test sets
train_X, test_X, train_Y, test_Y = train_test_split(X_train_smote, y_train_smote, test_size=0.2, random_state=42)

# Convert to DMatrix format
dtrain = xgb.DMatrix(train_X, label=train_Y)
dtest = xgb.DMatrix(test_X, label=test_Y)

# Train the model
param = {"objective": "binary:logistic", "eta": 1, "max_depth": 2}
bst = xgb.train(param, dtrain, num_boost_round=10)

# Load your unlabelled test data
test_data = pd.read_csv('unlabelled_test1.csv')

# Keep the 'ID' column
ids = test_data['ID']

# Preprocess test data similarly
X_test = test_data.drop('ID', axis=1)
X_test = pd.get_dummies(X_test, columns=['News/Comment'])

# Ensure the test data has the same columns as the training data
missing_cols = set(train_X.columns) - set(X_test.columns)
for col in missing_cols:
    X_test[col] = 0
X_test = X_test[train_X.columns]

# **Convert boolean columns to integers in test data as well**
for column in X_test.columns:
    if X_test[column].dtype == bool:
        X_test[column] = X_test[column].astype(int)

# Convert test data to DMatrix format
dtest = xgb.DMatrix(X_test)

# Make predictions using the trained model
pred = bst.predict(dtest)

# Convert predictions to binary (0 or 1) based on a threshold (usually 0.5)
pred_binary = (pred > 0.5).astype(int)

# Add numeric predictions and ID back to the test data
test_data['Predicted_Type'] = pred_binary
test_data['ID'] = ids

# Save the predictions to a new CSV file with ID and labels
output_file_path = 'unlabelled_test1_with_predictions_smote.csv'
test_data[['ID', 'Predicted_Type']].to_csv(output_file_path, index=False)

print(f"Predictions saved to {output_file_path}")

  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[col] = 0
  X_test[c

Predictions saved to unlabelled_test1_with_predictions_smote.csv
