In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from bs4 import BeautifulSoup

from tqdm import tqdm_notebook as tqdm
from sklearn.feature_extraction.text import CountVectorizer

import pandas as pd
import numpy as np
import nltk

from sklearn.metrics import accuracy_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV



In [2]:
# Lookup table from numeric label id (1-8) to the emotion name it stands for.
# NOTE(review): presumably these ids are the values found in the 'Feeling' column - confirm.
emotions = dict(enumerate(
    (
        "Anger",
        "Anticipation",
        "Disgust",
        "Fear",
        "Joy",
        "Sadness",
        "Surprise",
        "Trust",
    ),
    start=1,
))

## Getting the data

In [3]:
def get_data(path):
    """Read a tab-separated file into a two-column DataFrame.

    The file is parsed without a header row; the columns are named
    'Sentence' and 'Feeling', and the first parsed row is then discarded.

    NOTE(review): the discarded row is presumably the file's own header
    line - confirm. If so, it is parsed as data before being dropped, which
    can leave 'Feeling' holding strings rather than numbers.

    Parameters
    ----------
    path : str or path-like
        Location of the tab-separated, UTF-8 encoded text file.

    Returns
    -------
    pandas.DataFrame
        Every parsed row except the first, keeping the original index labels.
    """
    column_names = ['Sentence', 'Feeling']
    parsed = pd.read_csv(path, sep='\t', names=column_names, encoding='utf-8')

    # Skip the first parsed row, keep everything after it.
    return parsed.iloc[1:]

In [4]:
# Folder holding the project's data files. Kept in one constant so the notebook
# can be pointed at another location by editing a single line.
DATA_DIR = r"C:\Users\conde\OneDrive\OneDrive Docs\Documents\Masters\2nd semester\Text Mining\Project\TextMining"

# Training and development (validation) splits, both read with get_data.
train_data = get_data(DATA_DIR + r"\training_set.txt")
val_data = get_data(DATA_DIR + r"\dev_set.txt")

In [5]:
# Display the loaded training frame (the notebook renders a truncated view).
train_data

Unnamed: 0,Sentence,Feeling
1,I'm too old to be traded in .,6
2,Mother said you could always tell a lady by he...,8
3,I always said I'd leave off when the time came .,6
4,He'll be safe with me .,2
5,Lay off .,1
...,...,...
13996,"Yes , Commissioner .",8
13997,The ring !,7
13998,That is my greatest and most enduring love .,5
13999,"When I came back from the war , I had a son .",5


In [6]:
# Display the loaded validation frame (the notebook renders a truncated view).
val_data

Unnamed: 0,Sentence,Feeling
1,What happens to the gold in our safe ?,4
2,Natural to get cold feet .,8
3,"Not very lucky , is he ?",7
4,I'm just a little anxious to get up there and ...,2
5,Did you think we don't know about your affair ...,1
...,...,...
996,All I ask of you is be careful .,4
997,"You don't like jazz , pal ?",7
998,Put it on .,1
999,Can you ever imagine [PERSON] being in a spot ...,7


## Preprocessing

In [7]:
# Fetch the NLTK resources that process() relies on.
nltk.download('wordnet')  # WordNet data used by WordNetLemmatizer
nltk.download('stopwords') # stop-word lists: connectives and similar words that carry little signal for prediction and are filtered out in process()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\conde\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\conde\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# Open Multilingual WordNet data.
# NOTE(review): presumably required by the WordNet lemmatizer on newer NLTK releases - confirm.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\conde\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [9]:
def process(dataframe):
    """Clean the 'Sentence' column of ``dataframe`` for word-count style models.

    Each sentence is stripped of HTML markup, has every character that is not
    an ASCII letter replaced by a space, is lower-cased and split on
    whitespace. Tokens found in NLTK's English stop-word list are dropped,
    the remaining tokens are lemmatized with WordNet and joined back together
    with single spaces.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a 'Sentence' column of strings.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. Its 'Sentence' column is
        overwritten in place with the cleaned text, so the caller's frame
        changes too.
    """
    stop_words = set(stopwords.words("english"))

    # Built once: neither the lemmatizer nor the pattern changes between rows.
    lem = WordNetLemmatizer()
    non_letters = re.compile('[^a-zA-Z]')

    processed_corpus = []
    for text in tqdm(dataframe['Sentence'], total=len(dataframe)):
        # Strip markup. The parser is named explicitly so the result does not
        # depend on which optional parsers happen to be installed (and so
        # BeautifulSoup does not warn about guessing one).
        text = BeautifulSoup(text, "html.parser").get_text()

        # Punctuation, digits and any other non-letter become a space.
        text = non_letters.sub(' ', text)

        tokens = text.lower().split()

        # Drop stop words first, then lemmatize what is left.
        kept = [lem.lemmatize(word) for word in tokens if word not in stop_words]
        processed_corpus.append(" ".join(kept))

    dataframe['Sentence'] = processed_corpus

    return dataframe

## Feature Extraction

In [10]:
def bag_of_words(train_df, val_df, max_features=5000):
    """Encode sentences as dense word-count vectors.

    The vocabulary is learned from ``train_df['Sentence']`` only and then
    used to encode ``val_df['Sentence']``. Pass the training frame as both
    arguments to obtain the training features.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame whose 'Sentence' column is used to fit the vocabulary.
    val_df : pandas.DataFrame
        Frame whose 'Sentence' column is transformed and returned.
    max_features : int or None, default 5000
        Passed to ``CountVectorizer(max_features=...)`` to cap the vocabulary
        size. The default matches the value that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Dense count matrix with one row per sentence of ``val_df``.
    """
    vectorizer = CountVectorizer(max_features=max_features)

    vectorizer.fit(train_df['Sentence'])
    feature_space = vectorizer.transform(val_df['Sentence'])

    return feature_space.toarray()

In [11]:
# Sanity check: dimensions of the bag-of-words matrix fitted on, and computed for, the training sentences.
bag_of_words(train_data, train_data).shape

(14000, 5000)

In [12]:
def n_grams(train_df, val_df, ngram_range=(2, 2)):
    """Encode sentences as dense n-gram count vectors.

    The n-gram vocabulary is learned from ``train_df['Sentence']`` only and
    then used to encode ``val_df['Sentence']``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame whose 'Sentence' column is used to fit the vocabulary.
    val_df : pandas.DataFrame
        Frame whose 'Sentence' column is transformed and returned.
    ngram_range : tuple of (int, int), default (2, 2)
        Passed to ``CountVectorizer(ngram_range=...)``. The default
        (bigrams only) matches the value that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Dense count matrix with one row per sentence of ``val_df``.
        No vocabulary cap is applied here, so the dense array can be large.
    """
    vectorizer = CountVectorizer(ngram_range=ngram_range)

    vectorizer.fit(train_df['Sentence'])
    feature_space = vectorizer.transform(val_df['Sentence'])

    return feature_space.toarray()

In [13]:
def tf_idf(train_df, val_df):
    """Encode sentences as TF-IDF weighted vectors.

    A ``TfidfVectorizer`` with default settings is fitted on
    ``train_df['Sentence']`` and applied to ``val_df['Sentence']``.

    Returns the matrix exactly as produced by ``transform``. Unlike the
    count-based helpers, no dense conversion is performed.
    """
    fitted_vectorizer = TfidfVectorizer().fit(train_df['Sentence'])
    return fitted_vectorizer.transform(val_df['Sentence'])

In [14]:
from sklearn.feature_extraction.text import TfidfTransformer

In [15]:
def tf_idf_n_grams(train_df, val_df):
    """Encode sentences as TF-IDF weighted bigram vectors.

    Bigram counts are produced by a ``CountVectorizer`` fitted on
    ``train_df['Sentence']``. The IDF weights are learned from those training
    counts and then applied to the bigram counts of ``val_df['Sentence']``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame whose 'Sentence' column fits both the vocabulary and the IDF weights.
    val_df : pandas.DataFrame
        Frame whose 'Sentence' column is encoded and returned.

    Returns
    -------
    The TF-IDF matrix as produced by ``TfidfTransformer.transform``
    (no dense conversion), with one row per sentence of ``val_df``.
    """
    vectorizer = CountVectorizer(ngram_range = (2,2))
    transformer = TfidfTransformer()

    # The transformer works on count matrices, not on the vectorizer object or
    # on raw text, so the sentences are turned into counts first.
    train_counts = vectorizer.fit_transform(train_df['Sentence'])
    transformer.fit(train_counts)

    val_counts = vectorizer.transform(val_df['Sentence'])
    feature_space = transformer.transform(val_counts)

    return feature_space

## Modelling

In [16]:
# lr = LogisticRegression()
# svc = SVC()
# knn = KNeighborsClassifier()
# nb = GaussianNB()

In [17]:
# models = [lr, svc, knn, nb]

In [18]:
# features = [bag_of_words, n_grams, tf_idf]

In [19]:
# logistic_params = {
#     'C': [0.1, 0.5],
#     'penalty': ['l1', 'none'],
#     'solver': ['lbfgs', 'saga']
# }
# svc_params = {
#     'C': [0.5, 1],
#     'kernel': ['sigmoid', 'rbf']
# }
# knc_params = {
#     'n_neighbors': [5, 10],
#     'weights' : ['uniform', 'distance']
# }
# nb_params = {}

In [20]:
# params_list = [logistic_params, svc_params, knc_params, nb_params]

In [21]:
# for feature in features:
#     x = 0
#     for model in models:
#         gs = GridSearchCV(estimator=model,
#                          param_grid=params_list[x],
#                          scoring='accuracy',
#                          verbose=1,
#                          n_jobs=-1).fit(feature(train_data, train_data), train_data['Feeling'])
#         x += 1
#         y_train = gs.predict(feature(train_data, train_data))
#         y_predict = gs.predict(feature(train_data, val_data))

#         print(f"Training Accuracy of {model} using {feature} %.3f" 
#         %accuracy_score(train_data['Feeling'], y_train))

#         print(f"Training F1 of {model} using {feature} %.3f" 
#         %f1_score(train_data['Feeling'], y_train, average = 'weighted'))

#         print(f"Val Accuracy of {model} using {feature} %.3f" 
#         %accuracy_score(val_data['Feeling'], y_predict))

#         print(f"Val F1 of {model} using {feature} %.3f" 
#         %f1_score(val_data['Feeling'], y_predict, average = 'weighted'))

In [22]:
# x = 0
# for model in models:
#     gs = GridSearchCV(estimator=model,
#                         param_grid=params_list[x],
#                         scoring='accuracy',
#                         verbose=1,
#                         n_jobs=-1).fit(bag_of_words(train_data, train_data), train_data['Feeling'])
#     x += 1
#     y_train = gs.predict(bag_of_words(train_data, train_data))
#     y_predict = gs.predict(bag_of_words(train_data, val_data))

#     print(f"Training Accuracy of {model} using bag_of_words %.3f" 
#     %accuracy_score(train_data['Feeling'], y_train))

#     print(f"Training F1 of {model} using bag_of_words %.3f" 
#     %f1_score(train_data['Feeling'], y_train, average = 'weighted'))

#     print(f"Val Accuracy of {model} using bag_of_words %.3f" 
#     %accuracy_score(val_data['Feeling'], y_predict))

#     print(f"Val F1 of {model} using bag_of_words %.3f" 
#     %f1_score(val_data['Feeling'], y_predict, average = 'weighted'))

In [23]:
# x = 0
# for model in models:
#     gs = GridSearchCV(estimator=model,
#                         param_grid=params_list[x],
#                         scoring='accuracy',
#                         verbose=1,
#                         n_jobs=-1).fit(n_grams(train_data, train_data), train_data['Feeling'])
#     x += 1
#     y_train = gs.predict(n_grams(train_data, train_data))
#     y_predict = gs.predict(n_grams(train_data, val_data))

#     print(f"Training Accuracy of {model} using n_grams %.3f" 
#     %accuracy_score(train_data['Feeling'], y_train))

#     print(f"Training F1 of {model} using n_grams %.3f" 
#     %f1_score(train_data['Feeling'], y_train, average = 'weighted'))

#     print(f"Val Accuracy of {model} using n_grams %.3f" 
#     %accuracy_score(val_data['Feeling'], y_predict))

#     print(f"Val F1 of {model} using n_grams %.3f" 
#     %f1_score(val_data['Feeling'], y_predict, average = 'weighted'))

In [24]:
# x = 0
# for model in models:
#     gs = GridSearchCV(estimator=model,
#                         param_grid=params_list[x],
#                         scoring='accuracy',
#                         verbose=1,
#                         n_jobs=-1).fit(tf_idf(train_data, train_data), train_data['Feeling'])
#     x += 1
#     y_train = gs.predict(tf_idf(train_data, train_data))
#     y_predict = gs.predict(tf_idf(train_data, val_data))

#     print(f"Training Accuracy of {model} using tf_idf %.3f" 
#     %accuracy_score(train_data['Feeling'], y_train))

#     print(f"Training F1 of {model} using tf_idf %.3f" 
#     %f1_score(train_data['Feeling'], y_train, average = 'weighted'))

#     print(f"Val Accuracy of {model} using tf_idf %.3f" 
#     %accuracy_score(val_data['Feeling'], y_predict))

#     print(f"Val F1 of {model} using tf_idf %.3f" 
#     %f1_score(val_data['Feeling'], y_predict, average = 'weighted'))

### Modelling: Second Epoch

In [25]:
# models = [lr, svc]

In [26]:
# logistic_params = {
#     'C': [0.1, 0.5],
#     'penalty': ['l1', 'none'],
#     'solver': ['lbfgs', 'saga']
# }
# svc_params = {
#     'C': [0.1, 0.5, 1, 10, 100],
#     'kernel': ['linear', 'rbf', 'poly'],
#     'gamma' : [0.1, 1, 10, 100]
# }

In [27]:
# params_list = [logistic_params, svc_params]

In [28]:
# features = [bag_of_words, tf_idf, n_grams, tf_idf_n_grams]

In [29]:
# #Normal Bag of words

# x = 0
# for model in models:
#     gs = GridSearchCV(estimator=model,
#                         param_grid=params_list[x],
#                         scoring='accuracy',
#                         verbose=1,
#                         n_jobs=-1).fit(bag_of_words(train_data, train_data), train_data['Feeling'])
#     x += 1
#     y_train = gs.predict(bag_of_words(train_data, train_data))
#     y_predict = gs.predict(bag_of_words(train_data, val_data))

#     print(f"Training Accuracy of {model} using bag_of_words %.3f" 
#     %accuracy_score(train_data['Feeling'], y_train))

#     print(f"Training F1 of {model} using bag_of_words %.3f" 
#     %f1_score(train_data['Feeling'], y_train, average = 'weighted'))

#     print(f"Val Accuracy of {model} using bag_of_words %.3f" 
#     %accuracy_score(val_data['Feeling'], y_predict))

#     print(f"Val F1 of {model} using bag_of_words %.3f" 
#     %f1_score(val_data['Feeling'], y_predict, average = 'weighted'))

#     print('Best parameters found:\n', gs.best_params_)

#     print('---------------------------------------------------------------------------------------------------------------------')
#     # All results
#     means = gs.cv_results_['mean_test_score']
#     stds = gs.cv_results_['std_test_score']
#     for mean, std, params in zip(means, stds, gs.cv_results_['params']):
#         print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [30]:
# #TF-IDF

# x = 0
# for model in models:
#     gs = GridSearchCV(estimator=model,
#                         param_grid=params_list[x],
#                         scoring='accuracy',
#                         verbose=1,
#                         n_jobs=-1).fit(tf_idf(train_data, train_data), train_data['Feeling'])
#     x += 1
#     y_train = gs.predict(tf_idf(train_data, train_data))
#     y_predict = gs.predict(tf_idf(train_data, val_data))

#     print(f"Training Accuracy of {model} using tf_idf %.3f" 
#     %accuracy_score(train_data['Feeling'], y_train))

#     print(f"Training F1 of {model} using tf_idf %.3f" 
#     %f1_score(train_data['Feeling'], y_train, average = 'weighted'))

#     print(f"Val Accuracy of {model} using tf_idf %.3f" 
#     %accuracy_score(val_data['Feeling'], y_predict))

#     print(f"Val F1 of {model} using tf_idf %.3f" 
#     %f1_score(val_data['Feeling'], y_predict, average = 'weighted'))

#     print('Best parameters found:\n', gs.best_params_)

#     print('---------------------------------------------------------------------------------------------------------------------')
#     # All results
#     means = gs.cv_results_['mean_test_score']
#     stds = gs.cv_results_['std_test_score']
#     for mean, std, params in zip(means, stds, gs.cv_results_['params']):
#         print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [31]:
# # N_Grams

# x = 0
# for model in models:
#     gs = GridSearchCV(estimator=model,
#                         param_grid=params_list[x],
#                         scoring='accuracy',
#                         verbose=1,
#                         n_jobs=-1).fit(n_grams(train_data, train_data), train_data['Feeling'])
#     x += 1
#     y_train = gs.predict(n_grams(train_data, train_data))
#     y_predict = gs.predict(n_grams(train_data, val_data))

#     print(f"Training Accuracy of {model} using n_grams %.3f" 
#     %accuracy_score(train_data['Feeling'], y_train))

#     print(f"Training F1 of {model} using n_grams %.3f" 
#     %f1_score(train_data['Feeling'], y_train, average = 'weighted'))

#     print(f"Val Accuracy of {model} using n_grams %.3f" 
#     %accuracy_score(val_data['Feeling'], y_predict))

#     print(f"Val F1 of {model} using n_grams %.3f" 
#     %f1_score(val_data['Feeling'], y_predict, average = 'weighted'))

#     print('Best parameters found:\n', gs.best_params_)

#     print('---------------------------------------------------------------------------------------------------------------------')
#     # All results
#     means = gs.cv_results_['mean_test_score']
#     stds = gs.cv_results_['std_test_score']
#     for mean, std, params in zip(means, stds, gs.cv_results_['params']):
#         print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [32]:
# # TF_IDF_N_Grams

# x = 0
# for model in models:
#     gs = GridSearchCV(estimator=model,
#                         param_grid=params_list[x],
#                         scoring='accuracy',
#                         verbose=1,
#                         n_jobs=-1).fit(tf_idf_n_grams(train_data, train_data), train_data['Feeling'])
#     x += 1
#     y_train = gs.predict(tf_idf_n_grams(train_data, train_data))
#     y_predict = gs.predict(tf_idf_n_grams(train_data, val_data))

#     print(f"Training Accuracy of {model} using tf_idf_n_grams %.3f" 
#     %accuracy_score(train_data['Feeling'], y_train))

#     print(f"Training F1 of {model} using tf_idf_n_grams %.3f" 
#     %f1_score(train_data['Feeling'], y_train, average = 'weighted'))

#     print(f"Val Accuracy of {model} using tf_idf_n_grams %.3f" 
#     %accuracy_score(val_data['Feeling'], y_predict))

#     print(f"Val F1 of {model} using tf_idf_n_grams %.3f" 
#     %f1_score(val_data['Feeling'], y_predict, average = 'weighted'))

#     print('Best parameters found:\n', gs.best_params_)

#     print('---------------------------------------------------------------------------------------------------------------------')
#     # All results
#     means = gs.cv_results_['mean_test_score']
#     stds = gs.cv_results_['std_test_score']
#     for mean, std, params in zip(means, stds, gs.cv_results_['params']):
#         print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [33]:
# # LR + BoW

# data = bag_of_words(train_data, train_data)
# eval_data = bag_of_words(train_data, val_data)

# method = "bag of words"
# model_used = 'Logistic Regression'

# lr_bag_gs = GridSearchCV(estimator=lr,
#                     param_grid=params_list[0],
#                     scoring='accuracy',
#                     verbose=1,
#                     n_jobs=-1).fit(data, train_data['Feeling'])
# y_train = lr_bag_gs.predict(data)
# y_predict = lr_bag_gs.predict(eval_data)

# print(f"Training Accuracy of {model_used} using {method} %.3f" 
# %accuracy_score(train_data['Feeling'], y_train))

# print(f"Training F1 of {model_used} using {method} %.3f" 
# %f1_score(train_data['Feeling'], y_train, average = 'weighted'))

# print(f"Val Accuracy of {model_used} using {method} %.3f" 
# %accuracy_score(val_data['Feeling'], y_predict))

# print(f"Val F1 of {model_used} using {method} %.3f" 
# %f1_score(val_data['Feeling'], y_predict, average = 'weighted'))

# print('Best parameters found:\n', lr_bag_gs.best_params_)

# print('---------------------------------------------------------------------------------------------------------------------')
# # All results
# means = lr_bag_gs.cv_results_['mean_test_score']
# stds = lr_bag_gs.cv_results_['std_test_score']
# for mean, std, params in zip(means, stds, lr_bag_gs.cv_results_['params']):
#     print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [34]:
# #LR + TF-IDF

# data = tf_idf(train_data, train_data)
# eval_data = tf_idf(train_data, val_data)

# method = "tf-idf"
# model_used = 'Logistic Regression'

# lr_tfidf_gs = GridSearchCV(estimator=lr,
#                     param_grid=params_list[0],
#                     scoring='accuracy',
#                     verbose=1,
#                     n_jobs=-1).fit(data, train_data['Feeling'])
# y_train = lr_tfidf_gs.predict(data)
# y_predict = lr_tfidf_gs.predict(eval_data)

# print(f"Training Accuracy of {model_used} using {method} %.3f" 
# %accuracy_score(train_data['Feeling'], y_train))

# print(f"Training F1 of {model_used} using {method} %.3f" 
# %f1_score(train_data['Feeling'], y_train, average = 'weighted'))

# print(f"Val Accuracy of {model_used} using {method} %.3f" 
# %accuracy_score(val_data['Feeling'], y_predict))

# print(f"Val F1 of {model_used} using {method} %.3f" 
# %f1_score(val_data['Feeling'], y_predict, average = 'weighted'))

# print('Best parameters found:\n', lr_tfidf_gs.best_params_)

# print('---------------------------------------------------------------------------------------------------------------------')
# # All results
# means = lr_tfidf_gs.cv_results_['mean_test_score']
# stds = lr_tfidf_gs.cv_results_['std_test_score']
# for mean, std, params in zip(means, stds, lr_tfidf_gs.cv_results_['params']):
#     print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [35]:
# # LR + N-Grams

# data = n_grams(train_data, train_data)
# eval_data = n_grams(train_data, val_data)

# method = "n-grams"
# model_used = 'Logistic Regression'

# lr_ngrams_gs = GridSearchCV(estimator=lr,
#                     param_grid=params_list[0],
#                     scoring='accuracy',
#                     verbose=1,
#                     n_jobs=-1).fit(data, train_data['Feeling'])
# y_train = lr_ngrams_gs.predict(data)
# y_predict = lr_ngrams_gs.predict(eval_data)

# print(f"Training Accuracy of {model_used} using {method} %.3f" 
# %accuracy_score(train_data['Feeling'], y_train))

# print(f"Training F1 of {model_used} using {method} %.3f" 
# %f1_score(train_data['Feeling'], y_train, average = 'weighted'))

# print(f"Val Accuracy of {model_used} using {method} %.3f" 
# %accuracy_score(val_data['Feeling'], y_predict))

# print(f"Val F1 of {model_used} using {method} %.3f" 
# %f1_score(val_data['Feeling'], y_predict, average = 'weighted'))

# print('Best parameters found:\n', lr_ngrams_gs.best_params_)

# print('---------------------------------------------------------------------------------------------------------------------')
# # All results
# means = lr_ngrams_gs.cv_results_['mean_test_score']
# stds = lr_ngrams_gs.cv_results_['std_test_score']
# for mean, std, params in zip(means, stds, lr_ngrams_gs.cv_results_['params']):
#     print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [36]:
# #LR + TF-IDF of N-Grams

# data = tf_idf_n_grams(train_data, train_data)
# eval_data = tf_idf_n_grams(train_data, val_data)

# method = "tf-idf of n-grams"
# model_used = 'Logistic Regression'

# lr_ntfidf_gs = GridSearchCV(estimator=lr,
#                     param_grid=params_list[0],
#                     scoring='accuracy',
#                     verbose=1,
#                     n_jobs=-1).fit(data, train_data['Feeling'])
# y_train = lr_ntfidf_gs.predict(data)
# y_predict = lr_ntfidf_gs.predict(eval_data)

# print(f"Training Accuracy of {model_used} using {method} %.3f" 
# %accuracy_score(train_data['Feeling'], y_train))

# print(f"Training F1 of {model_used} using {method} %.3f" 
# %f1_score(train_data['Feeling'], y_train, average = 'weighted'))

# print(f"Val Accuracy of {model_used} using {method} %.3f" 
# %accuracy_score(val_data['Feeling'], y_predict))

# print(f"Val F1 of {model_used} using {method} %.3f" 
# %f1_score(val_data['Feeling'], y_predict, average = 'weighted'))

# print('Best parameters found:\n', lr_ntfidf_gs.best_params_)

# print('---------------------------------------------------------------------------------------------------------------------')
# # All results
# means = lr_ntfidf_gs.cv_results_['mean_test_score']
# stds = lr_ntfidf_gs.cv_results_['std_test_score']
# for mean, std, params in zip(means, stds, lr_ntfidf_gs.cv_results_['params']):
#     print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [37]:
# # SVC + BoW

# data = bag_of_words(train_data, train_data)
# eval_data = bag_of_words(train_data, val_data)

# method = "bag of words"
# model_used = 'SVC'

# lr_bag_gs = GridSearchCV(estimator=svc,
#                     param_grid=params_list[1],
#                     scoring='accuracy',
#                     verbose=1,
#                     n_jobs=-1).fit(data, train_data['Feeling'])
# y_train = lr_bag_gs.predict(data)
# y_predict = lr_bag_gs.predict(eval_data)

# print(f"Training Accuracy of {model_used} using {method} %.3f" 
# %accuracy_score(train_data['Feeling'], y_train))

# print(f"Training F1 of {model_used} using {method} %.3f" 
# %f1_score(train_data['Feeling'], y_train, average = 'weighted'))

# print(f"Val Accuracy of {model_used} using {method} %.3f" 
# %accuracy_score(val_data['Feeling'], y_predict))

# print(f"Val F1 of {model_used} using {method} %.3f" 
# %f1_score(val_data['Feeling'], y_predict, average = 'weighted'))

# print('Best parameters found:\n', lr_bag_gs.best_params_)

# print('---------------------------------------------------------------------------------------------------------------------')
# # All results
# means = lr_bag_gs.cv_results_['mean_test_score']
# stds = lr_bag_gs.cv_results_['std_test_score']
# for mean, std, params in zip(means, stds, lr_bag_gs.cv_results_['params']):
#     print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [38]:
# # SVC + TF-IDF

# data = tf_idf(train_data, train_data)
# eval_data = tf_idf(train_data, val_data)

# method = "tf-idf"
# model_used = 'SVC'

# svc_tfidf_gs = GridSearchCV(estimator=svc,
#                     param_grid=params_list[1],
#                     scoring='accuracy',
#                     verbose=1,
#                     n_jobs=-1).fit(data, train_data['Feeling'])
# y_train = svc_tfidf_gs.predict(data)
# y_predict = svc_tfidf_gs.predict(eval_data)

# print(f"Training Accuracy of {model_used} using {method} %.3f" 
# %accuracy_score(train_data['Feeling'], y_train))

# print(f"Training F1 of {model_used} using {method} %.3f" 
# %f1_score(train_data['Feeling'], y_train, average = 'weighted'))

# print(f"Val Accuracy of {model_used} using {method} %.3f" 
# %accuracy_score(val_data['Feeling'], y_predict))

# print(f"Val F1 of {model_used} using {method} %.3f" 
# %f1_score(val_data['Feeling'], y_predict, average = 'weighted'))

# print('Best parameters found:\n', svc_tfidf_gs.best_params_)

# print('---------------------------------------------------------------------------------------------------------------------')
# # All results
# means = svc_tfidf_gs.cv_results_['mean_test_score']
# stds = svc_tfidf_gs.cv_results_['std_test_score']
# for mean, std, params in zip(means, stds, svc_tfidf_gs.cv_results_['params']):
#     print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [40]:
from sklearn.linear_model import SGDClassifier

In [61]:
# Linear classifier trained with stochastic gradient descent; loss='log' selects the
# logistic-regression loss and random_state pins the seed so runs are repeatable.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' - confirm against the installed version.
sgd = SGDClassifier(loss = 'log', random_state = 3)

In [62]:
# SGDClassifier + BoW

# Labels used in the printed report. The spelling of model_used is kept as it
# was so the printed lines match the recorded output.
method = "bag of words"
model_used = 'SGDCassifier'

# Both splits are encoded with a vocabulary learned from the training sentences only.
data = bag_of_words(train_data, train_data)
eval_data = bag_of_words(train_data, val_data)

fitted = sgd.fit(data, train_data['Feeling'])
y_train = fitted.predict(data)
y_predict = fitted.predict(eval_data)

# Report accuracy and weighted F1 for the training split, then the validation split.
for split, y_true, y_hat in (("Training", train_data['Feeling'], y_train),
                             ("Val", val_data['Feeling'], y_predict)):
    split_accuracy = accuracy_score(y_true, y_hat)
    split_f1 = f1_score(y_true, y_hat, average = 'weighted')
    print(f"{split} Accuracy of {model_used} using {method} {split_accuracy:.3f}")
    print()
    print(f"{split} F1 of {model_used} using {method} {split_f1:.3f}")
    print()

Training Accuracy of SGDCassifier using bag of words 0.596
Training F1 of SGDCassifier using bag of words 0.594
Val Accuracy of SGDCassifier using bag of words 0.380
Val F1 of SGDCassifier using bag of words 0.372


# Tests on solution set

In [43]:
# Reference labels for the test set, read as a regular comma-separated file.
# NOTE(review): absolute, machine-specific path - the notebook will not run elsewhere without editing it.
# NOTE(review): `solutions` is not used by any of the cells visible here - confirm whether the comparison is still pending.
solutions = pd.read_csv(r"C:\Users\conde\OneDrive\OneDrive Docs\Documents\Masters\2nd semester\Text Mining\Project\TextMining\test_solution.csv")

In [65]:
# Test sentences, loaded with the same reader as the training and validation splits.
# NOTE(review): absolute, machine-specific path - the notebook will not run elsewhere without editing it.
test_set = get_data(r"C:\Users\conde\OneDrive\OneDrive Docs\Documents\Masters\2nd semester\Text Mining\Project\TextMining\test_set.txt")

In [66]:
# Display the test frame as loaded; its 'Feeling' column is empty at this point.
test_set

Unnamed: 0,Sentence,Feeling
1,"Come , let's go get that automobile .",
2,"Well , some other time , then ?",
3,He's in trouble . Boy ?,
4,Criminal gang unit to take control .,
5,By Molto . No discussion . Interview .,
...,...,...
1996,Let's do some good .,
1997,"Hey , you're looking good .",
1998,"Have they moved the embassy , or are you hijac...",
1999,What that !,


In [68]:
# Predict a label for every test sentence with the fitted SGD model. The sentences are
# encoded with the bag-of-words vocabulary learned from the training data.
# This overwrites test_set's 'Feeling' column in place.
test_set['Feeling'] = fitted.predict(bag_of_words(train_data, test_set))

In [69]:
# Display the test frame again, now with the predicted labels in 'Feeling'.
test_set

Unnamed: 0,Sentence,Feeling
1,"Come , let's go get that automobile .",2
2,"Well , some other time , then ?",2
3,He's in trouble . Boy ?,4
4,Criminal gang unit to take control .,8
5,By Molto . No discussion . Interview .,1
...,...,...
1996,Let's do some good .,2
1997,"Hey , you're looking good .",5
1998,"Have they moved the embassy , or are you hijac...",1
1999,What that !,7
