In [1]:
from preprocess import *
from utilities import *
from constants import *
from supervised_sentiment_analysis import *


ImportError: No module named 'nltk'

- Reformat results and update them with the manual labels from the two files
- Filter data to only include labeled results and get number of labels

In [None]:
import pickle

# NOTE(security): pickle.load can execute arbitrary code while deserializing.
# Only load pickles that were produced by this project.
# The context manager guarantees the file handle is closed after loading.
with open('merged_results.pickle', 'rb') as results_file:
    merged_results = pickle.load(results_file)

# Update the results with the manual labels from the two per-question files.
for manual_tags_path in ('manual_tags_Q1.csv', 'manual_tags_Q2.csv'):
    merged_results = get_manual_tags(merged_results, manual_tags_path)

# Keep only the rows whose 'Manual Tag' is something other than 'no tag'.
labeled_results = merged_results[merged_results['Manual Tag'] != 'no tag']
labeled_results.shape

- get a summary of the accuracy of a RandomForestClassifier on the results
    - see [here](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#) for info on the Random Forest Classifier
    - data is split 70-30 between training and testing
    - "Regular Word Counts" means the feature vectors are raw counts of how many times a word occurs in each text
    - "TFIDF Normalized Word Counts" means the feature vectors are normalized using term frequency–inverse document frequency
        - see [here](https://en.wikipedia.org/wiki/Tf–idf) for info on tf-idf
    - Summaries given for classification on all data, just question 1, and just question 2
    - labels are given as \[-1, -0.5, 0, 0.5, 1\] indicating negative, slightly negative, neutral, slightly positive, or positive responses, respectively

In [None]:
# Per-question subsets of the merged results.
# NOTE(review): Q2 is assumed to be exported by `constants` alongside Q1 — confirm.
q1_merged_results = merged_results[merged_results['Question'] == Q1]
# Fix: this subset used to filter on Q1 again, so the "question 2" summary
# silently repeated the question 1 numbers.
q2_merged_results = merged_results[merged_results['Question'] == Q2]

# Summaries are printed for all data, question 1, and question 2, in that order.
for data in [merged_results, q1_merged_results, q2_merged_results]:
    random_forest_accuracy_summary(data)
    print()

- Exactly the same as above, except responses are classified as negative (-1 or -0.5), neutral (0) or positive (0.5 or 1)

In [None]:
# Labeled responses belonging to question 1 only.
is_q1 = labeled_results['Question'] == Q1
q1_labeled_results = labeled_results[is_q1]

# Show the dimensions of the word-count features built from those responses.
q1_word_counts = get_word_counts(q1_labeled_results)
q1_word_counts.shape

In [None]:
# Same three datasets as the earlier summary, scored against the 'Categorical Tag' column.
for result_subset in (merged_results, q1_merged_results, q2_merged_results):
    random_forest_accuracy_summary(result_subset, key='Categorical Tag')
    print()  # blank line between summaries

- plot confusion matrices using Random Forest Classifier (rfc), Gaussian Naive Bayes (gnb), and Support Vector Classification (svc)
    - Info on [Random Forest](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#), [Gaussian Naive Bayes](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB), and [Support Vector Classification](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC)
    - Confusion matrices are made based on positive, negative, or neutral responses
    - Numbers in heatmap indicate total counts and normalized counts for each cell
    - Given for Question 1 results, Question 2 results, and combined results in that order

In [None]:
# Confusion-matrix heatmaps using the Random Forest Classifier ('rfc').
confusion_matrix_summary(merged_results, model='rfc')

In [None]:
# Confusion-matrix heatmaps using Gaussian Naive Bayes ('gnb').
confusion_matrix_summary(merged_results, model='gnb')

In [None]:
# Confusion-matrix heatmaps using Support Vector Classification ('svc').
confusion_matrix_summary(merged_results, model='svc')

In [None]:
# Resampling settings; forwarded as `trials=` / `splits=` to the
# confusion-matrix and heatmap helpers in the cells below.
TRIALS, SPLITS = 1, 100

In [None]:
# Labeled question 1 responses are the dataset for the remaining cells.
q1_labeled_results = labeled_results[labeled_results['Question'] == Q1]

# Both feature representations are computed, but only the TF-IDF one
# is bound to `features` and used downstream.
features_counts = get_word_counts(q1_labeled_results)
features_tfidf = get_tfidf(q1_labeled_results)
features = features_tfidf

labels = np.array(q1_labeled_results['Categorical Tag'])

# Confusion matrices per classifier, keyed by classifier name.
conf_matrices = get_conf_matrices(
    features,
    labels,
    clfs=['svc', 'gnb', 'rfc'],
    trials=TRIALS,
    splits=SPLITS,
    test_size=0.3,
)

In [None]:
# Restrict the averaged, normalized heatmap to the SVC confusion matrices.
svc_conf_matrices = {'svc': conf_matrices['svc']}
get_average_heatmaps(svc_conf_matrices, trials=TRIALS, splits=SPLITS, normalize=True)

In [None]:
cv_count = 10

# SVC and Random Forest are scored on `features` as-is.
for estimator_cls in (svm.SVC, RandomForestClassifier):
    get_cross_validation_scores(features, labels, estimator_cls(), cv_count=cv_count)

# GaussianNB is given the `.toarray()` form of the features
# (presumably `features` is a sparse matrix — confirm against get_tfidf).
get_cross_validation_scores(features.toarray(), labels, GaussianNB(), cv_count=cv_count)

In [None]:
svc, rfc, gnb = svm.SVC(), RandomForestClassifier(), GaussianNB()

# For each classifier, print the mean kappa over 100 trials and twice its standard deviation.
for clf in (svc, rfc, gnb):
    kappa_scores = np.array(get_kappa(features.toarray(), labels, clf, trials=100))
    print(kappa_scores.mean(), 2*kappa_scores.std())
    

In [None]:
def macro_F(conf_matrix):
    """Return the macro-averaged F score of a square confusion matrix.

    For each class ``k`` the diagonal cell is the number of correct
    predictions. Precision divides it by the total of column ``k`` and recall
    divides it by the total of row ``k``. The per-class F score is the harmonic
    mean of the two, and the result is the unweighted mean over all classes.
    Transposing the matrix swaps precision and recall, which leaves the F
    score unchanged.

    A class with an empty column, an empty row, or no correct predictions has
    an undefined ratio somewhere in that chain. Those ratios are scored as 0.0
    instead of raising ``ZeroDivisionError`` or producing NaN.

    Parameters
    ----------
    conf_matrix : square sequence of sequences (or 2-D array) of counts.

    Returns
    -------
    float
        Mean of the per-class F scores.
    """
    dim = len(conf_matrix)
    F_scores = []
    for k in range(dim):
        correct = conf_matrix[k][k]
        column_total = sum(conf_matrix[j][k] for j in range(dim))  # correct + false positives
        row_total = sum(conf_matrix[k][j] for j in range(dim))  # correct + false negatives
        # Guard each ratio: an empty row/column would otherwise be 0/0.
        precision = correct / column_total if column_total else 0.0
        recall = correct / row_total if row_total else 0.0
        if precision + recall:
            F = 2 * precision * recall / (precision + recall)
        else:
            # No correct predictions for this class: F is defined as 0.
            F = 0.0
        F_scores.append(F)
    return np.mean(F_scores)

In [None]:
# For each classifier, print its name, then (mean, 2 * std) of the macro F
# scores over all of its confusion matrices.
for clf, clf_conf_matrices in conf_matrices.items():
    print(clf)
    macro_F_scores = [macro_F(matrix) for matrix in clf_conf_matrices]
    print((np.mean(macro_F_scores), 2*np.std(macro_F_scores)))