In [53]:
import csv

from sklearn import naive_bayes, dummy, ensemble, neighbors, tree, feature_extraction, calibration, linear_model, multiclass, svm
import pandas as pd

In [92]:
# Load the three CSV inputs used by the rest of the notebook:
#   learn_data    - fitted on by the classifiers below
#   test_data     - scored against after fitting
#   complete_data - the set whose Text column receives final predictions
learn_data = pd.read_csv("data/sampled_tweets200_categorized.csv")
test_data = pd.read_csv("data/sampled_tweets_categorized.csv")
complete_data = pd.read_csv("data/target_data.csv")

In [39]:
def test_performance(classifiers, vectorizers, train, test):
    for classifier in classifiers:
        for vec in vectorizers:
            string = ""
            string += classifier.__class__.__name__ + " with " + vec.__class__.__name__

            vectorize_text = vec.fit_transform(train["Text"])
            classifier.fit(vectorize_text, train["Category"])

            vectorize_text = vec.transform(test["Text"])
            score = classifier.score(vectorize_text, test["Category"])

            string += ". has score: " + str(score)

            print(string)

In [43]:
# Candidate estimators for the sweep, listed one per line so entries are easy to add or drop.
classifiers = [
    naive_bayes.BernoulliNB(),
    ensemble.RandomForestClassifier(n_estimators=100, n_jobs=-1),
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    tree.DecisionTreeClassifier(),
    calibration.CalibratedClassifierCV(),
    dummy.DummyClassifier(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    neighbors.KNeighborsClassifier(),
    multiclass.OneVsRestClassifier(svm.SVC(kernel="linear")),
    multiclass.OneVsRestClassifier(linear_model.LogisticRegression()),
]

In [44]:
# Text vectorizers paired with every classifier in the sweep.
vectorizers = [
    feature_extraction.text.CountVectorizer(),
    feature_extraction.text.TfidfVectorizer(),
    feature_extraction.text.HashingVectorizer(),
]

In [45]:
# Run the full sweep: each classifier/vectorizer pairing is fitted on learn_data
# and scored on test_data, printing one result line per pairing.
test_performance(classifiers, vectorizers, learn_data, test_data)

BernoulliNB with CountVectorizer. has score: 0.66
BernoulliNB with TfidfVectorizer. has score: 0.66
BernoulliNB with HashingVectorizer. has score: 0.65
RandomForestClassifier with CountVectorizer. has score: 0.67
RandomForestClassifier with TfidfVectorizer. has score: 0.65
RandomForestClassifier with HashingVectorizer. has score: 0.71
AdaBoostClassifier with CountVectorizer. has score: 0.52
AdaBoostClassifier with TfidfVectorizer. has score: 0.6
AdaBoostClassifier with HashingVectorizer. has score: 0.63
BaggingClassifier with CountVectorizer. has score: 0.6
BaggingClassifier with TfidfVectorizer. has score: 0.52
BaggingClassifier with HashingVectorizer. has score: 0.61
ExtraTreesClassifier with CountVectorizer. has score: 0.65
ExtraTreesClassifier with TfidfVectorizer. has score: 0.68
ExtraTreesClassifier with HashingVectorizer. has score: 0.71
GradientBoostingClassifier with CountVectorizer. has score: 0.62
GradientBoostingClassifier with TfidfVectorizer. has score: 0.53
GradientBoost

In [46]:
# Shortlists for closer inspection. Order matters: later cells select entries by index.
classifiers_to_test = [
    calibration.CalibratedClassifierCV(),
    linear_model.RidgeClassifier(),
    multiclass.OneVsRestClassifier(linear_model.LogisticRegression()),
]
vecs_to_test = [
    feature_extraction.text.HashingVectorizer(),
    feature_extraction.text.CountVectorizer(),
]

In [86]:
# Pick the pair to evaluate by position in the shortlists:
# index 2 of classifiers_to_test is OneVsRestClassifier(LogisticRegression()),
# index 1 of vecs_to_test is CountVectorizer().
current_classifier = classifiers_to_test[2]
current_vectorizer = vecs_to_test[1]

In [87]:
# Fit the selected vectorizer on the training text, then fit the selected classifier
# on the resulting features and the training categories.
vectorize_text = current_vectorizer.fit_transform(learn_data["Text"])
current_classifier.fit(vectorize_text, learn_data["Category"])

In [88]:
# Score the fitted pair on the held-out test set; the vectorizer is only applied, not re-fitted.
vectorize_text = current_vectorizer.transform(test_data["Text"])
score = current_classifier.score(vectorize_text, test_data["Category"])
print(score)

0.74


In [89]:
# Build one report row per test tweet:
#   [row number, text, true category, predicted category, "correct" / "incorrect"].
# Columns are read by name (as the scoring cell does) rather than by position:
# positional `row[0]` on a label-indexed Series is deprecated in pandas and silently
# depends on the CSV's column order.
test_texts = test_data["Text"]
test_answers = test_data["Category"]

# Vectorize and predict the whole test set in one call instead of once per row.
test_predictions = current_classifier.predict(current_vectorizer.transform(test_texts))

# The loop variable is named `prediction`, not `predict`, so this cell never rebinds
# the module-level name used by the predict() helper.
csv_arr = [
    [number, text, answer, prediction, "correct" if prediction == answer else "incorrect"]
    for number, (text, answer, prediction) in enumerate(
        zip(test_texts, test_answers, test_predictions)
    )
]

In [90]:
# Persist the per-tweet evaluation rows. The encoding is pinned to UTF-8 (matching the
# predictions export) so non-ASCII tweet text does not depend on the platform's default
# encoding, which can raise UnicodeEncodeError on write.
with open("data/test_score_3.csv", "w", encoding = "utf-8", newline = "") as f:
    writer = csv.writer(f)
    writer.writerow(["#", "Text", "Answer", "Prediction", "Result"])
    writer.writerows(csv_arr)

In [100]:
def predict(message, vectorizer, classifier):
    """Return the classifier's prediction for a single message.

    Args:
        message: The text to classify; it is wrapped in a one-element list before
            being handed to the vectorizer.
        vectorizer: Object exposing ``transform(list_of_texts)``.
        classifier: Object exposing ``predict(features)`` that returns an indexable
            sequence of predictions.

    Returns:
        The first (and only) element of the classifier's prediction output.
    """
    features = vectorizer.transform([message])
    return classifier.predict(features)[0]

In [110]:
# Pair every message in the full data set with its predicted category.
preds_arr = [
    [msg, predict(msg, current_vectorizer, current_classifier)]
    for msg in complete_data.Text
]

In [106]:
# Sample message kept for ad-hoc checks; NOTE(review): not referenced by any visible cell —
# presumably meant to be passed to predict(); confirm or remove.
test_msg = "Foreign Affairs C.S Amb Raychelle Omamo today bid farewell to H.E Loh Sock Tiong, outgoing High Commissioner of Malaysia after his tour of duty. The C.S praised efforts made by the High Commission"

In [116]:
# Export each message from the full data set alongside its predicted category.
with open("data/predictions.csv", "w", encoding = "utf-8", newline = "") as out_file:
    csv_out = csv.writer(out_file)
    csv_out.writerow(["Text", "Prediction"])
    csv_out.writerows(preds_arr)