**Imports**

In [204]:
import joblib
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

**Loading Data**

In [205]:
source_X_train = joblib.load('Processed Data/pp_train_sources.data')
source_X_dev = joblib.load('Processed Data/pp_dev_sources.data')
source_X_test= joblib.load('Processed Data/pp_test_sources.data')

#replies_X_train = joblib.load('Processed Data/pp_train_replies.data')
#replies_X_dev= joblib.load('Processed Data/pp_dev_replies.data')
#replies_X_test = joblib.load('Processed Data/pp_test_replies.data')

**Separating Labels from Features**

In [206]:
source_y_train = source_X_train.pop('class')
source_y_dev = source_X_dev.pop('class')
source_X_test.drop(['class'], axis=1, inplace=True)

#replies_y_train = replies_X_train.pop('source class')
#replies_y_dev = replies_X_dev.pop('source class')
#replies_X_test.drop(['source class'], axis=1, inplace=True)

**Model Testing**

In [207]:
def check_val_accuracy(preds, source_y_dev=source_y_dev):
    correct = 0
    for id, pred in enumerate(preds):
        if pred == source_y_dev[id]:
            correct += 1

    return correct / len(preds)

In [208]:
best_accuracy = 0
best_c = 0
best_pred = []
for c in np.arange(0.1, 2, 0.2):
    lr_classif = LogisticRegression(C=c, max_iter=1000)

    lr_classif.fit(source_X_train, source_y_train)
    dev_pred = lr_classif.predict(source_X_dev)
    acc = check_val_accuracy(dev_pred)

    if acc > best_accuracy:
        best_accuracy = acc
        best_c = c
        best_pred = dev_pred

In [209]:
best_accuracy

0.9214953271028037

In [210]:
best_c

0.9000000000000001

In [211]:
print(classification_report(source_y_dev, dev_pred))

              precision    recall  f1-score   support

   nonrumour       0.93      0.96      0.95       420
      rumour       0.85      0.74      0.79       115

    accuracy                           0.92       535
   macro avg       0.89      0.85      0.87       535
weighted avg       0.91      0.92      0.91       535



In [212]:
import pandas as pd

In [213]:

best_accuracy = 0
best_c = 0
best_pred = []
for c in np.arange(0.1, 3, 0.2):
    classif = SVC(C=c)

    classif.fit(source_X_train, source_y_train)
    dev_pred = classif.predict(source_X_dev)
    acc = check_val_accuracy(dev_pred)

    if acc > best_accuracy:
        best_accuracy = acc
        best_c = c
        best_pred = dev_pred


In [214]:
best_accuracy

0.9289719626168225

In [215]:
best_c

1.5000000000000004

In [216]:
print(classification_report(source_y_dev, best_pred))

              precision    recall  f1-score   support

   nonrumour       0.94      0.98      0.96       420
      rumour       0.90      0.76      0.82       115

    accuracy                           0.93       535
   macro avg       0.92      0.87      0.89       535
weighted avg       0.93      0.93      0.93       535



In [217]:
test_pred = classif.predict(source_X_test)

In [218]:
pred_df = pd.DataFrame(test_pred, columns=["Predicted"])
pred_df.name = 'id'
pred_df.loc[pred_df['Predicted'] == 'nonrumour'] = 0
pred_df.loc[pred_df['Predicted'] == 'rumour'] = 1
pred_df

Unnamed: 0,Predicted
0,0
1,0
2,0
3,0
4,0
...,...
553,0
554,1
555,1
556,0


In [219]:
pred_df.to_csv('predictions2.csv', index=True, header=True)