# Supervised Machine Learning

In [80]:
import pandas as pd
# Show up to 100 characters per cell so tweet text is readable in displayed frames
pd.set_option("display.max_colwidth", 100)

## Using the original train and eval sets

In [81]:
# Load train-tweets.csv and keep only the tweet text column as the training features
path = "/Users/kaisoon/Google Drive/Code/Python/COMP90049_KT/MachLearning/train-tweets.csv"
X_train = pd.read_csv(path)['tweet']
X_train.shape

(22987,)

In [82]:
# Load eval-tweets.csv and keep only the tweet text column as the evaluation features
path = "/Users/kaisoon/Google Drive/Code/Python/COMP90049_KT/MachLearning/eval-tweets.csv"
X_eval = pd.read_csv(path)['tweet']
X_eval.shape

(4926,)

In [83]:
# Load train-labels.csv and keep only the label column as the training targets
path = "/Users/kaisoon/Google Drive/Code/Python/COMP90049_KT/MachLearning/train-labels.csv"
y_train = pd.read_csv(path)['label']
y_train.shape

(22987,)

In [84]:
# Load eval-labels.csv and keep only the label column as the evaluation targets
path = "/Users/kaisoon/Google Drive/Code/Python/COMP90049_KT/MachLearning/eval-labels.csv"
y_eval = pd.read_csv(path)['label']
y_eval.shape

(4926,)

In [85]:
# Vectorise documents as TF-IDF weighted vocabulary counts and classify with a Random Forest
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics

# Seed the forest so the scores below are reproducible on a fresh kernel
# (same seed value as the train/eval split used later in this notebook).
pipe = Pipeline([("tfidf", TfidfVectorizer()), ("rndf", RandomForestClassifier(random_state=42))])
pipe.fit(X_train, y_train)
pred = pipe.predict(X_eval)

# Display results
cm = pd.DataFrame(confusion_matrix(y_eval, pred))
print(cm)
print(classification_report(y_eval, pred))
# average=None yields one score per class instead of a single aggregate
prec = metrics.precision_score(y_eval, pred, average=None)
recall = metrics.recall_score(y_eval, pred, average=None)
# Print overall accuracy so the cell reproduces the accuracy shown in its recorded output
print(metrics.accuracy_score(y_eval, pred))



     0     1    2
0  282   670   86
1  205  1923  272
2   68   845  575
              precision    recall  f1-score   support

    negative       0.51      0.27      0.35      1038
     neutral       0.56      0.80      0.66      2400
    positive       0.62      0.39      0.48      1488

    accuracy                           0.56      4926
   macro avg       0.56      0.49      0.50      4926
weighted avg       0.57      0.56      0.54      4926

0.5643524157531465


## Splitting the combined dataset randomly to get train and eval sets

In [117]:
# Read train-tweets.csv in full (all columns) and preview the first five rows
path = "/Users/kaisoon/Google Drive/Code/Python/COMP90049_KT/MachLearning/train-tweets.csv"
tweet1 = pd.read_csv(path)
tweet1.head(n=5)

Unnamed: 0,tweet,id
0,It has been a blessing for many with no Medicaid expansion in Florida. Most jobs don't offer ins...,802334127760490496
1,"'Flashback Friday to last halloween when I was Hulk Hogan. @HulkHogan Good vibes are with you, ...",624764543663742976
2,Doctors hit campaign trail as race to medical council elections heats up https://t.co/iiFdwb9v0W...,805582613687713793
3,Is anybody going to the radio station tomorrow to see Shawn? Me and my friend may go but we woul...,637480203497832448
4,I just found out Naruto didn't become the 5th Hokage....,641096279930507265


In [118]:
# Report (rows, columns) of the frame read from train-tweets.csv
tweet1.shape

(22987, 2)

In [119]:
# Check that emoticons are still intact after loading by inspecting the row labelled 56
tweet1.loc[56]

tweet    if tim duncan played one more season we would've been an able to call him 21 savage 😔...
id                                                                             802314709106126848
Name: 56, dtype: object

In [120]:
# Read eval-tweets.csv and discard its extra "tweetExtra" column so the
# remaining columns line up with the training tweets frame
path = "/Users/kaisoon/Google Drive/Code/Python/COMP90049_KT/MachLearning/eval-tweets.csv"
tweet2 = pd.read_csv(path).drop(columns="tweetExtra")
tweet2.head(n=5)

Unnamed: 0,tweet,id
0,Today or Tomorrow night work have to been done by midnight. No shame or sharing with twilight yo...,676020703410462720
1,"'in any time passing Discussion with Christians , they always Highlight Jihad in Islam. Some of...",638664771693346816
2,Wishhhhh I was going to Jason Aldean tomorrow,631543842194501632
3,"@kenklippenstein she is obviously right. putin does the same with le pen, fpö. afd etc. in europ...",802210698931634048
4,I am thankful for @bitchy_antics for pointing out how all the animals in the shop sat up when Th...,669918243163922432


In [92]:
# Report (rows, columns) of the eval tweets frame after dropping "tweetExtra"
tweet2.shape

(4926, 2)

In [93]:
# Stack the train and eval tweets into one frame with a fresh 0..n-1 index
tweet = pd.concat(
    objs=[tweet1, tweet2],
    ignore_index=True,
)
tweet.shape

(27913, 2)

In [94]:
# Count missing values per column in the combined (train + eval) tweets frame
tweet.isna().sum()

tweet    0
id       0
dtype: int64

In [95]:
# Open train-labels.csv
path = "/Users/kaisoon/Google Drive/Code/Python/COMP90049_KT/MachLearning/train-labels.csv"
label1 = pd.read_csv(path)
label1.head()

Unnamed: 0,id,label
0,802334127760490496,negative
1,624764543663742976,positive
2,805582613687713793,neutral
3,637480203497832448,positive
4,641096279930507265,neutral


In [96]:
# Report (rows, columns) of the frame read from train-labels.csv
label1.shape

(22987, 2)

In [97]:
# Open eval-labels.csv
path = "/Users/kaisoon/Google Drive/Code/Python/COMP90049_KT/MachLearning/eval-labels.csv"
label2 = pd.read_csv(path)
label2.head()

Unnamed: 0,id,label
0,676020703410462720,neutral
1,638664771693346816,neutral
2,631543842194501632,positive
3,802210698931634048,neutral
4,669918243163922432,positive


In [98]:
# Report (rows, columns) of the frame read from eval-labels.csv
label2.shape

(4926, 2)

In [99]:
# Stack the train and eval labels into one frame with a fresh 0..n-1 index
label = pd.concat([label1, label2], ignore_index=True)
# Only a cell's last expression is rendered, so print the shape explicitly;
# a bare `label.shape` in the middle of the cell is silently discarded.
print(label.shape)
label.head()

Unnamed: 0,id,label
0,802334127760490496,negative
1,624764543663742976,positive
2,805582613687713793,neutral
3,637480203497832448,positive
4,641096279930507265,neutral


In [100]:
# Class distribution of the combined labels (number of rows per label value)
label['label'].value_counts()

neutral     13854
positive     7959
negative     6100
Name: label, dtype: int64

In [101]:
# Attach each tweet's label by joining on the shared id column;
# the inner join keeps only ids present in both frames
data = tweet.merge(label, how='inner', on='id')
data.head(n=5)

Unnamed: 0,tweet,id,label
0,It has been a blessing for many with no Medicaid expansion in Florida. Most jobs don't offer ins...,802334127760490496,negative
1,"'Flashback Friday to last halloween when I was Hulk Hogan. @HulkHogan Good vibes are with you, ...",624764543663742976,positive
2,Doctors hit campaign trail as race to medical council elections heats up https://t.co/iiFdwb9v0W...,805582613687713793,neutral
3,Is anybody going to the radio station tomorrow to see Shawn? Me and my friend may go but we woul...,637480203497832448,positive
4,I just found out Naruto didn't become the 5th Hokage....,641096279930507265,neutral


In [102]:
# Features are the tweet text, targets are the labels; both come from the merged frame
X, y = data['tweet'], data['label']
X.shape == y.shape

True

In [103]:
# Randomly hold out 30% of the combined data for evaluation (seeded for repeatability),
# producing train/eval features and train/eval labels
from sklearn.model_selection import train_test_split
X_train, X_eval, y_train, y_eval = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [114]:
# Vectorise documents as TF-IDF weighted vocabulary counts and classify with a Random Forest
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics

# Seed the forest so the scores below are reproducible on a fresh kernel
# (same seed value as the train/eval split above).
pipe = Pipeline([("tfidf", TfidfVectorizer()), ("rndf", RandomForestClassifier(random_state=42))])
pipe.fit(X_train, y_train)
pred = pipe.predict(X_eval)

# Display results
cm = pd.DataFrame(confusion_matrix(y_eval, pred))
print(cm)
print(classification_report(y_eval, pred))
# average=None yields one score per class instead of a single aggregate
prec = metrics.precision_score(y_eval, pred, average=None)
recall = metrics.recall_score(y_eval, pred, average=None)
# Print overall accuracy alongside the report, as the other model cells do
print(metrics.accuracy_score(y_eval, pred))



     0     1    2
0  517  1158  177
1  319  3270  559
2  100  1304  970


In [52]:
# Vectorise documents as raw vocabulary counts and classify with a Random Forest
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics

# Seed the forest so the scores below are reproducible on a fresh kernel
# (same seed value as the train/eval split above).
pipe = Pipeline([("cnt", CountVectorizer()), ("rndf", RandomForestClassifier(random_state=42))])
pipe.fit(X_train, y_train)
pred = pipe.predict(X_eval)

# Display results
# The recorded output of this cell includes the confusion matrix, so print it
# rather than leaving the call commented out.
print(pd.DataFrame(confusion_matrix(y_eval, pred)))
print(classification_report(y_eval, pred))
metrics.accuracy_score(y_eval, pred)



     0     1     2
0  560  1169   112
1  316  3380   406
2   74  1341  1016
              precision    recall  f1-score   support

    negative       0.59      0.30      0.40      1841
     neutral       0.57      0.82      0.68      4102
    positive       0.66      0.42      0.51      2431

    accuracy                           0.59      8374
   macro avg       0.61      0.52      0.53      8374
weighted avg       0.60      0.59      0.57      8374



0.5918318605206592

In [53]:
# Vectorise documents as TF-IDF weighted vocabulary counts and classify with a linear Support Vector Machine
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics

# Name the classifier step after the estimator it holds ("rndf" was a leftover
# from the Random Forest cells) and seed it for reproducibility.
pipe = Pipeline([("tfidf", TfidfVectorizer()), ("svc", LinearSVC(random_state=42))])
pipe.fit(X_train, y_train)
pred = pipe.predict(X_eval)

# Display results
# print(pd.DataFrame(confusion_matrix(y_eval, pred)))
print(classification_report(y_eval, pred))
metrics.accuracy_score(y_eval, pred)

              precision    recall  f1-score   support

    negative       0.63      0.53      0.57      1841
     neutral       0.65      0.73      0.69      4102
    positive       0.65      0.59      0.62      2431

    accuracy                           0.64      8374
   macro avg       0.64      0.62      0.63      8374
weighted avg       0.64      0.64      0.64      8374



0.6447336995462145

In [56]:
# Vectorise documents as TF-IDF weighted vocabulary counts and classify with a linear Support Vector Machine
# NOTE(review): this cell repeats the TF-IDF + LinearSVC experiment from the cell
# directly before it; consider removing one of the two.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics

# Name the classifier step after the estimator it holds ("rndf" was a leftover
# from the Random Forest cells) and seed it for reproducibility.
pipe = Pipeline([("tfidf", TfidfVectorizer()), ("svc", LinearSVC(random_state=42))])
pipe.fit(X_train, y_train)
pred = pipe.predict(X_eval)

# Display results
# print(pd.DataFrame(confusion_matrix(y_eval, pred)))
print(classification_report(y_eval, pred))
metrics.accuracy_score(y_eval, pred)

              precision    recall  f1-score   support

    negative       0.63      0.53      0.57      1841
     neutral       0.65      0.73      0.69      4102
    positive       0.65      0.59      0.62      2431

    accuracy                           0.64      8374
   macro avg       0.64      0.62      0.63      8374
weighted avg       0.64      0.64      0.64      8374



0.6447336995462145

In [54]:
# Vectorise documents as raw vocabulary counts and classify with a linear Support Vector Machine
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics

# Name the classifier step after the estimator it holds ("rndf" was a leftover
# from the Random Forest cells) and seed it for reproducibility.
pipe = Pipeline([("cnt", CountVectorizer()), ("svc", LinearSVC(random_state=42))])
pipe.fit(X_train, y_train)
pred = pipe.predict(X_eval)

# Display results
# print(pd.DataFrame(confusion_matrix(y_eval, pred)))
print(classification_report(y_eval, pred))
metrics.accuracy_score(y_eval, pred)

              precision    recall  f1-score   support

    negative       0.59      0.51      0.54      1841
     neutral       0.63      0.68      0.65      4102
    positive       0.60      0.60      0.60      2431

    accuracy                           0.62      8374
   macro avg       0.61      0.59      0.60      8374
weighted avg       0.61      0.62      0.61      8374



0.6153570575591115

In [59]:
# TF-IDF features fed into a multinomial Naive Bayes classifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics

pipe = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("nb", MultinomialNB()),
    ]
)
# fit() returns the fitted pipeline itself, so training and prediction can be chained
pred = pipe.fit(X_train, y_train).predict(X_eval)

# Report the confusion matrix, the per-class metrics, and the overall accuracy
print(pd.DataFrame(confusion_matrix(y_eval, pred)))
print(classification_report(y_eval, pred))
metrics.accuracy_score(y_eval, pred)

     0     1    2
0  108  1721   12
1   15  4014   73
2    3  1929  499
              precision    recall  f1-score   support

    negative       0.86      0.06      0.11      1841
     neutral       0.52      0.98      0.68      4102
    positive       0.85      0.21      0.33      2431

    accuracy                           0.55      8374
   macro avg       0.75      0.41      0.37      8374
weighted avg       0.69      0.55      0.45      8374



0.5518270838309052

In [None]:
# Read train-tweets.csv in full (all columns) and preview the first five rows
path = "/Users/kaisoon/Google Drive/Code/Python/COMP90049_KT/MachLearning/train-tweets.csv"
tweet1 = pd.read_csv(path)
tweet1.head(n=5)