In [16]:
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_validate, GridSearchCV
import pickle

# Import data and set variables

In [17]:
# Load the training set
train = pd.read_csv('Train.csv')

# Separate the predictor columns from the response column
is_feature_column = train.columns != 'has_Incident_Related'
X_train = train.loc[:, is_feature_column]
y_train = train[['has_Incident_Related']]

# Tweet text with the hashtags left out
X_train_text_no_hashtag = X_train['preprocessed_text_no_hashtag']

# Tweet text in which hashtags are kept as normal text
X_train_text = X_train['preprocessed_text']

# Flag for whether a tweet has media, cast to integer (expected to be 0 or 1)
X_train_media = X_train['has_media'].astype('int')

# Response: whether the tweet is incident related, cast to integer (expected to be 0 or 1)
y_train = y_train['has_Incident_Related'].astype('int')

# Convert text data into tf-idf

In [18]:
# Fit one tf-idf vectorizer per text variant and turn each document-term matrix
# into a dense DataFrame whose columns are the vocabulary terms.
# get_feature_names() was removed in scikit-learn 1.2, so the replacement
# get_feature_names_out() is used when the installed version provides it.

# Convert X_train_text_no_hashtag into tf-idf vectorized matrix
vc_text_no_hashtag = TfidfVectorizer().fit(X_train_text_no_hashtag)
feature_names_no_hashtag = (
    vc_text_no_hashtag.get_feature_names_out()
    if hasattr(vc_text_no_hashtag, 'get_feature_names_out')
    else vc_text_no_hashtag.get_feature_names()
)
X_train_text_no_hashtag_df = pd.DataFrame(
    # toarray() yields a plain ndarray (todense() returns the legacy np.matrix type)
    vc_text_no_hashtag.transform(X_train_text_no_hashtag).toarray(),
    columns=feature_names_no_hashtag,
)

# Convert X_train_text into tf-idf vectorized matrix
vc_text = TfidfVectorizer().fit(X_train_text)
feature_names_text = (
    vc_text.get_feature_names_out()
    if hasattr(vc_text, 'get_feature_names_out')
    else vc_text.get_feature_names()
)
X_train_text_df = pd.DataFrame(
    vc_text.transform(X_train_text).toarray(),
    columns=feature_names_text,
)

In [19]:
# Give every piece a fresh 0..n-1 row index so the column-wise concat lines rows up
X_train_text_no_hashtag_df = X_train_text_no_hashtag_df.reset_index(drop=True)
X_train_text_df = X_train_text_df.reset_index(drop=True)
X_train_media = X_train_media.reset_index(drop=True)

# Text features plus the media flag; ignore_index=True renumbers the columns 0..k
train_data_1 = pd.concat(
    [X_train_text_no_hashtag_df, X_train_media],
    axis=1,
    ignore_index=True,
)
train_data_2 = pd.concat(
    [X_train_text_df, X_train_media],
    axis=1,
    ignore_index=True,
)

In [20]:
# Search grid for the kernels that take a gamma coefficient (rbf, poly, sigmoid)
parameters1 = dict(
    C=[0.1, 1, 10, 100],
    gamma=[10, 1, 0.1, 0.01, 0.001, 0.0001, 'scale', 'auto'],
    kernel=['rbf', 'poly', 'sigmoid'],
)

# Search grid for the linear kernel, which does not use gamma
parameters2 = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear'],
)

def get_best_parameters(parameters, train, test):
    """Grid-search an SVC over `parameters` and print the best score and settings.

    parameters: parameter grid handed to GridSearchCV.
    train: feature matrix the search is fitted on.
    test: passed as the second argument of fit(), i.e. the target values
        (at the call sites in this notebook it is y_train, not a test set).

    Prints best_score_ (3-fold cross-validated F1) and then best_params_.
    Returns None.
    """
    search = GridSearchCV(
        estimator=SVC(),
        param_grid=parameters,
        scoring='f1',
        cv=3,
        n_jobs=4,
        verbose=True,
    )
    search.fit(train, test)
    for outcome in (search.best_score_, search.best_params_):
        print(outcome)

In [21]:
# Model 1 (hashtag-free text features), gamma-based kernels
print("Model 1 with parameters1")
get_best_parameters(
    parameters=parameters1,
    train=X_train_text_no_hashtag_df,
    test=y_train,
)

Model 1 with parameters1
Fitting 3 folds for each of 96 candidates, totalling 288 fits
0.887210695306651
{'C': 100, 'gamma': 1, 'kernel': 'rbf'}


In [22]:
# Model 1 (hashtag-free text features), linear kernel
print("Model 1 with parameters2")
get_best_parameters(
    parameters=parameters2,
    train=X_train_text_no_hashtag_df,
    test=y_train,
)

Model 1 with parameters2
Fitting 3 folds for each of 4 candidates, totalling 12 fits
0.8563174742848063
{'C': 10, 'kernel': 'linear'}


In [23]:
# Model 2 (text features with hashtags as normal text), gamma-based kernels
print("Model 2 with parameters1")
get_best_parameters(
    parameters=parameters1,
    train=X_train_text_df,
    test=y_train,
)

Model 2 with parameters1
Fitting 3 folds for each of 96 candidates, totalling 288 fits
0.8932724867888974
{'C': 100, 'gamma': 1, 'kernel': 'rbf'}


In [24]:
# Model 2 (text features with hashtags as normal text), linear kernel
print("Model 2 with parameters2")
get_best_parameters(
    parameters=parameters2,
    train=X_train_text_df,
    test=y_train,
)

Model 2 with parameters2
Fitting 3 folds for each of 4 candidates, totalling 12 fits
0.860880353904688
{'C': 10, 'kernel': 'linear'}


In [25]:
# Model 3 (hashtag-free text features plus media flag), gamma-based kernels
print("Model 3 with parameters1")
get_best_parameters(
    parameters=parameters1,
    train=train_data_1,
    test=y_train,
)

Model 3 with parameters1
Fitting 3 folds for each of 96 candidates, totalling 288 fits
0.8805361206084812
{'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}


In [26]:
# Model 3 (hashtag-free text features plus media flag), linear kernel
print("Model 3 with parameters2")
get_best_parameters(
    parameters=parameters2,
    train=train_data_1,
    test=y_train,
)

Model 3 with parameters2
Fitting 3 folds for each of 4 candidates, totalling 12 fits
0.8612955075528902
{'C': 10, 'kernel': 'linear'}


In [None]:
# Model 4 (text features with hashtags as normal text plus media flag), gamma-based kernels
print("Model 4 with parameters1")
get_best_parameters(
    parameters=parameters1,
    train=train_data_2,
    test=y_train,
)

Model 4 with parameters1
Fitting 3 folds for each of 96 candidates, totalling 288 fits


In [None]:
# Model 4 (text features with hashtags as normal text plus media flag), linear kernel
print("Model 4 with parameters2")
get_best_parameters(
    parameters=parameters2,
    train=train_data_2,
    test=y_train,
)

In [None]:
# Shuffled 10-fold split with a fixed seed, shared by all four models
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

# One default-parameter SVC per feature set. The names clf_svm1..clf_svm4 are
# kept because later cells fit, evaluate and save them.
# NOTE(review): these use SVC() defaults rather than the best parameters printed
# by the grid searches above - confirm that is intended.
clf_svm1 = SVC()
clf_svm2 = SVC()
clf_svm3 = SVC()
clf_svm4 = SVC()

# (description, estimator, feature matrix) for each cross-validation run
cv_runs = [
    ("Model trained on text of the tweet without hashtag",
     clf_svm1, X_train_text_no_hashtag_df),
    ("Model trained on text of the tweet with hashtag as normal text",
     clf_svm2, X_train_text_df),
    ("Model trained on text of the tweet without hashtag and the image",
     clf_svm3, train_data_1),
    ("Model trained on text of the tweet with hashtag as normal text and the image",
     clf_svm4, train_data_2),
]

for description, estimator, features in cv_runs:
    # cross_validate fits clones, so the estimators above stay unfitted here
    scores = cross_validate(
        estimator, features, y_train,
        cv=k_fold, scoring='f1', return_train_score=True, n_jobs=4, verbose=1,
    )

    print(description)
    print("Train score: ", scores["train_score"].mean())
    print("Test score: ", scores["test_score"].mean())
    print("-----------------------------")

In [None]:
# Fit each classifier on the full training set of its own feature variant
clf_svm1.fit(X=X_train_text_no_hashtag_df, y=y_train)
clf_svm2.fit(X=X_train_text_df, y=y_train)
clf_svm3.fit(X=train_data_1, y=y_train)
clf_svm4.fit(X=train_data_2, y=y_train)

In [None]:
# Import the Test data
# NOTE: read_pickle unpickles the file, so only load files from a trusted source.
test = pd.read_pickle('~/Documents/Github Repository/early-warning-twitter/Processed datasets/Model data/Test.pkl')

# Build the column mask from the test frame itself; a mask derived from
# train.columns is only valid if both frames have identical column layouts.
X_test = test.loc[:, test.columns != 'has_Incident_Related']
y_test = test[['has_Incident_Related']]

# Variable with the text of a tweet without hashtags
X_test_text_no_hashtag = X_test.loc[:, 'preprocessed_text_no_hashtag']

# Variable with the text of a tweet with hashtags as normal text
X_test_text = X_test.loc[:, 'preprocessed_text']

# Variable with whether a tweet has media (0 or 1)
X_test_media = X_test.loc[:, 'has_media'].astype('int')

# Response variable, whether the tweet is related to an incident (0 or 1)
y_test = y_test.loc[:, 'has_Incident_Related'].astype('int')

In [None]:
# Transform the test text with the vectorizers fitted on the training text.
# Column labels are taken from the matching training frames, which were built
# from the same vectorizers; this avoids get_feature_names(), which was removed
# in scikit-learn 1.2, and keeps train/test columns identical.

# Convert X_test_text_no_hashtag into tf-idf vectorized matrix
X_test_text_no_hashtag_df = pd.DataFrame(
    # toarray() yields a plain ndarray (todense() returns the legacy np.matrix type)
    vc_text_no_hashtag.transform(X_test_text_no_hashtag).toarray(),
    columns=X_train_text_no_hashtag_df.columns,
)

# Convert X_test_text into tf-idf vectorized matrix
X_test_text_df = pd.DataFrame(
    vc_text.transform(X_test_text).toarray(),
    columns=X_train_text_df.columns,
)

# Reset index for all variables so the column-wise concat lines rows up
X_test_text_no_hashtag_df = X_test_text_no_hashtag_df.reset_index(drop=True)
X_test_text_df = X_test_text_df.reset_index(drop=True)
X_test_media = X_test_media.reset_index(drop=True)

# Text features plus the media flag; ignore_index=True renumbers the columns 0..k
test_data_1 = pd.concat([X_test_text_no_hashtag_df, X_test_media], axis=1, ignore_index=True)
test_data_2 = pd.concat([X_test_text_df, X_test_media], axis=1, ignore_index=True)

In [None]:
def predict_data(model, test_data, test_y):
    """Evaluate a fitted binary classifier on a test set and plot its ROC curve.

    Prints the model's class name, accuracy, the classification report
    (classes labelled 'Negative' / 'Positive'), the confusion matrix and the
    ROC AUC, then shows the ROC curve with matplotlib.

    Parameters
    ----------
    model : fitted estimator offering predict() and either predict_proba()
        or decision_function().
    test_data : feature matrix to predict on.
    test_y : true labels for `test_data`.

    Returns
    -------
    None
    """
    # Imported here because the notebook's import cell does not load matplotlib
    import matplotlib.pyplot as plt

    model_name = type(model).__name__

    # Hard class predictions for the test set
    y_pred = model.predict(test_data)

    # Continuous scores for the positive class (needed for ROC / AUC).
    # SVC only exposes predict_proba when built with probability=True, so fall
    # back to the signed distance from the separating hyperplane otherwise.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(test_data)[:, 1]
    else:
        y_score = model.decision_function(test_data)

    # Compute and report the performance measures
    accuracy = metrics.accuracy_score(test_y, y_pred)
    print(model_name)
    print("accuracy:   %0.3f" % accuracy)

    print(metrics.classification_report(test_y, y_pred,
                                        target_names=['Negative', 'Positive']))

    print("confusion matrix:")
    print(metrics.confusion_matrix(test_y, y_pred))

    print('------------------------------')

    auc = metrics.roc_auc_score(test_y, y_score)
    print('%s: ROC AUC=%.3f' % (model_name, auc))

    fpr, tpr, _ = metrics.roc_curve(test_y, y_score)
    plt.plot(fpr, tpr, marker='.', label=model_name)
    # axis labels
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # show the legend
    plt.legend()
    # show the plot
    plt.show()
    print("-------------------------------------")

In [None]:
# Evaluate each fitted model on the test features that match its training features
predict_data(model=clf_svm1, test_data=X_test_text_no_hashtag_df, test_y=y_test)
predict_data(model=clf_svm2, test_data=X_test_text_df, test_y=y_test)
predict_data(model=clf_svm3, test_data=test_data_1, test_y=y_test)
predict_data(model=clf_svm4, test_data=test_data_2, test_y=y_test)

In [None]:
# Save models
# The fitted classifiers in this notebook are clf_svm1..clf_svm4; each one is
# pickled to its own numbered file, and the handle is closed by the with-block.
for model_number, fitted_model in enumerate((clf_svm1, clf_svm2, clf_svm3, clf_svm4), start=1):
    with open('support-vector-machines-%d.sav' % model_number, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)