In [1]:
import nltk
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import random
import string
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [2]:
test_df=pd.read_csv('test_twitter_x_test.csv')
train_df=pd.read_csv('training_twitter_x_y_train.csv')

In [3]:
train_df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


In [4]:
train_df.shape

(10980, 12)

In [5]:
train_df.isna().sum()

tweet_id                      0
airline_sentiment             0
airline                       0
airline_sentiment_gold    10949
name                          0
negativereason_gold       10956
retweet_count                 0
text                          0
tweet_coord               10204
tweet_created                 0
tweet_location             3550
user_timezone              3577
dtype: int64

In [6]:
x_train=train_df['text'].values
y_train=train_df['airline_sentiment'].values

In [7]:
stops=set(stopwords.words('english'))
punctuations=list(string.punctuation)
stops.update(punctuations)
lemmatizer=WordNetLemmatizer()

In [8]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [9]:
def clean_review(words):
    output_words=[]
    for w in words:
        if w.lower() not in stops:
            pos=pos_tag([w])
            clean_words=lemmatizer.lemmatize(w,pos=get_simple_pos(pos[0][1]))
            output_words.append(clean_words.lower())
    return output_words

In [10]:
text_doc=[]
for i in x_train:
    text_doc.append(' '.join(clean_review(i.split())))
x_train=text_doc

In [11]:
count_vec=CountVectorizer(max_features=10000)
x_train_features=count_vec.fit_transform(x_train)
x_train_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [12]:
count_vec.get_feature_names_out()

array(['00', '000', '000114', ..., 'zukes', 'zurich', 'zv2pt6trk9'],
      dtype=object)

In [13]:
x_test=test_df['text'].values
text_doc=[]
for i in x_test:
    text_doc.append(' '.join(clean_review(i.split())))
x_test=text_doc
x_test_features=count_vec.transform(x_test)
x_test_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [14]:
rfc=RandomForestClassifier()
rfc.fit(x_train_features,y_train)

RandomForestClassifier()

In [15]:
rfc.score(x_train_features,y_train)

0.9966302367941712

In [16]:
print(classification_report(y_train,rfc.predict(x_train_features)))

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00      6851
     neutral       1.00      0.99      0.99      2327
    positive       0.99      1.00      0.99      1802

    accuracy                           1.00     10980
   macro avg       0.99      0.99      0.99     10980
weighted avg       1.00      1.00      1.00     10980



In [17]:
etc=ExtraTreesClassifier()
etc.fit(x_train_features,y_train)

ExtraTreesClassifier()

In [18]:
etc.score(x_train_features,y_train)

0.9966302367941712

In [19]:
print(classification_report(y_train,etc.predict(x_train_features)))

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00      6851
     neutral       1.00      0.99      0.99      2327
    positive       0.99      0.99      0.99      1802

    accuracy                           1.00     10980
   macro avg       0.99      0.99      0.99     10980
weighted avg       1.00      1.00      1.00     10980



In [20]:
abc=AdaBoostClassifier()
abc.fit(x_train_features,y_train)

AdaBoostClassifier()

In [21]:
abc.score(x_train_features,y_train)

0.7235883424408015

In [22]:
print(classification_report(y_train,abc.predict(x_train_features)))

              precision    recall  f1-score   support

    negative       0.74      0.93      0.82      6851
     neutral       0.59      0.21      0.31      2327
    positive       0.70      0.61      0.65      1802

    accuracy                           0.72     10980
   macro avg       0.68      0.58      0.60     10980
weighted avg       0.70      0.72      0.69     10980



In [23]:
gbc=GradientBoostingClassifier()
gbc.fit(x_train_features,y_train)

GradientBoostingClassifier()

In [24]:
gbc.score(x_train_features,y_train)

0.7454462659380692

In [25]:
print(classification_report(y_train,gbc.predict(x_train_features)))

              precision    recall  f1-score   support

    negative       0.74      0.96      0.84      6851
     neutral       0.78      0.22      0.35      2327
    positive       0.75      0.59      0.66      1802

    accuracy                           0.75     10980
   macro avg       0.76      0.59      0.62     10980
weighted avg       0.75      0.75      0.71     10980



In [26]:
svm=SVC()
svm.fit(x_train_features,y_train)

SVC()

In [27]:
svm.score(x_train_features,y_train)

0.9180327868852459

In [28]:
print(classification_report(y_train,svm.predict(x_train_features)))

              precision    recall  f1-score   support

    negative       0.93      0.98      0.95      6851
     neutral       0.88      0.81      0.84      2327
    positive       0.93      0.83      0.87      1802

    accuracy                           0.92     10980
   macro avg       0.91      0.87      0.89     10980
weighted avg       0.92      0.92      0.92     10980



In [29]:
lr=LogisticRegression()
lr.fit(x_train_features,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [30]:
lr.score(x_train_features,y_train)

0.9393442622950819

In [31]:
print(classification_report(y_train,lr.predict(x_train_features)))

              precision    recall  f1-score   support

    negative       0.95      0.98      0.96      6851
     neutral       0.91      0.85      0.88      2327
    positive       0.93      0.91      0.92      1802

    accuracy                           0.94     10980
   macro avg       0.93      0.91      0.92     10980
weighted avg       0.94      0.94      0.94     10980



In [32]:
sgd=SGDClassifier()
sgd.fit(x_train_features,y_train)

SGDClassifier()

In [33]:
sgd.score(x_train_features,y_train)

0.9493624772313297

In [34]:
print(classification_report(y_train,sgd.predict(x_train_features)))

              precision    recall  f1-score   support

    negative       0.96      0.98      0.97      6851
     neutral       0.94      0.86      0.90      2327
    positive       0.94      0.94      0.94      1802

    accuracy                           0.95     10980
   macro avg       0.94      0.93      0.93     10980
weighted avg       0.95      0.95      0.95     10980



In [35]:
knn=KNeighborsClassifier()
knn.fit(x_train_features,y_train)

KNeighborsClassifier()

In [36]:
knn.score(x_train_features,y_train)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.6676684881602915

In [37]:
print(classification_report(y_train,knn.predict(x_train_features)))

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


              precision    recall  f1-score   support

    negative       0.90      0.64      0.75      6851
     neutral       0.42      0.78      0.55      2327
    positive       0.62      0.62      0.62      1802

    accuracy                           0.67     10980
   macro avg       0.65      0.68      0.64     10980
weighted avg       0.75      0.67      0.69     10980



In [38]:
g_naive_bayes=GaussianNB()
g_naive_bayes.fit(x_train_features.toarray(),y_train)

GaussianNB()

In [39]:
g_naive_bayes.score(x_train_features.toarray(),y_train)

0.7400728597449909

In [40]:
print(classification_report(y_train,g_naive_bayes.predict(x_train_features.toarray())))

              precision    recall  f1-score   support

    negative       1.00      0.69      0.81      6851
     neutral       0.68      0.69      0.68      2327
    positive       0.46      1.00      0.63      1802

    accuracy                           0.74     10980
   macro avg       0.71      0.79      0.71     10980
weighted avg       0.84      0.74      0.76     10980



In [41]:
m_naive_bayes=MultinomialNB()
m_naive_bayes.fit(x_train_features.toarray(),y_train)

MultinomialNB()

In [42]:
m_naive_bayes.score(x_train_features.toarray(),y_train)

0.8453551912568306

In [43]:
print(classification_report(y_train,m_naive_bayes.predict(x_train_features.toarray())))

              precision    recall  f1-score   support

    negative       0.84      0.96      0.90      6851
     neutral       0.83      0.59      0.69      2327
    positive       0.87      0.74      0.80      1802

    accuracy                           0.85     10980
   macro avg       0.85      0.76      0.80     10980
weighted avg       0.84      0.85      0.84     10980



In [44]:
y_pred=etc.predict(x_test_features)
ser=pd.Series(y_pred)
ser.to_csv('y_pred.csv',header=None,index=False)