In [69]:
import numpy as np
import pandas as pd
import nltk
import re
import string
import matplotlib.pyplot as plt
#methods and stopwords text processing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection
from sklearn.model_selection import train_test_split

# Machine learning Libraries
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [24]:
data=pd.read_csv('twitter_train.csv')
data.airline_sentiment

0        negative
1        positive
2        positive
3        negative
4        negative
           ...   
10975     neutral
10976    positive
10977    negative
10978    negative
10979    negative
Name: airline_sentiment, Length: 10980, dtype: object

### Get rid Unwanted columns


In [5]:
def delete_cols(df,cols):
    for col in cols:
        del df[col]
    return df      

###  Preprocessing the Text

In [37]:
# Creating a stopwords set
sw=set(stopwords.words('english'))
def preprocess_tweet_text(tweet):
    # convert all the text lowercase
    tweet=tweet.lower()
    
    # remove any urls
    tweet=re.sub(r"http\S+|www\S+|http\S+","",tweet, flags=re.MULTILINE)
    
    #remove puncutations
    tweet =tweet.translate(str.maketrans("","",string.punctuation))
    
    #remove user @ References and "#" ,Numbers from tweet
    tweet=re.sub(r'\@\w+|\#',"",tweet)
    #remove stopwords
    tweet_tokens=word_tokenize(tweet)
    filtered_words=[word for word in tweet_tokens if word not in sw]
    
    # Stemming
    ps= PorterStemmer()
    stemmed_words=[ps.stem(w) for w in filtered_words]
    
    #lemmatizing
    lemmatizer=WordNetLemmatizer()
    lemma_words=[lemmatizer.lemmatize(w,pos='a') for w in stemmed_words]
    
    return " ".join(lemma_words)

In [35]:
drop_cols = ['airline_sentiment_gold','name','negativereason_gold','tweet_id', 'retweet_count','tweet_created','user_timezone','tweet_coord','tweet_location']

delete_cols(data,drop_cols)

Unnamed: 0,airline_sentiment,airline,text
0,negative,Southwest,"@SouthwestAir I am scheduled for the morning, ..."
1,positive,Southwest,@SouthwestAir seeing your workers time in and ...
2,positive,United,@united Flew ORD to Miami and back and had gr...
3,negative,Southwest,@SouthwestAir @dultch97 that's horse radish 😤🐴
4,negative,United,@united so our flight into ORD was delayed bec...
...,...,...,...
10975,neutral,American,@AmericanAir followback
10976,positive,United,@united thanks for the help. Wish the phone re...
10977,negative,US Airways,@usairways the. Worst. Ever. #dca #customerser...
10978,negative,US Airways,@nrhodes85: look! Another apology. DO NOT FLY ...


In [38]:
data_index = data.index
for index, row in data.iterrows():
    tweet=row.text
    tweet=preprocess_tweet_text(tweet)
    tweet = " %s %s" % (tweet, row.airline)
    row.text = tweet

In [40]:
def deEmojify(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii')

for index, row in data.iterrows():
    row.text = deEmojify(row.text)


In [41]:
data.head()

Unnamed: 0,airline_sentiment,airline,text
0,negative,Southwest,southwestair schedul morn 2 day fact yesnot s...
1,positive,Southwest,southwestair see worker time time go beyond l...
2,positive,United,unit flew ord miami back great crew servic le...
3,negative,Southwest,southwestair dultch97 that hors radish South...
4,negative,United,unit flight ord delay air forc one last fligh...


In [55]:
x_train,x_test,y_train,y_test=model_selection.train_test_split(data,data.airline_sentiment)
print(data.shape)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(10980, 3)
(8235, 3)
(2745, 3)
(8235,)
(2745,)


In [59]:
print(y_train.head())
x_test.head()

10343    negative
4622     negative
4782     negative
10872    negative
7704     negative
Name: airline_sentiment, dtype: object


Unnamed: 0,airline_sentiment,airline,text
6919,negative,American,americanair file loctjycqh bag airport sinc l...
5015,negative,US Airways,usairway travel 2 year old would appreci hold...
4247,negative,American,americanair wasnt offer perk usairway ask tol...
3923,negative,US Airways,usairway US Airways
10805,neutral,United,unit dm sent United


In [61]:
v = TfidfVectorizer(analyzer='word', max_features=3150, max_df = 0.8, ngram_range=(1,1))
train_features= v.fit_transform(x_train.text)
test_features=v.transform(x_test.text)

In [62]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB
mnb=MultinomialNB()
# Train
mnb.fit(train_features,y_train)
#predictions
pred1=mnb.predict(test_features)


In [85]:
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,classification_report
from scikitplot.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [80]:
cnf_matrix=confusion_matrix(y_test,pred1)

print(cnf_matrix)

[[1687   22    7]
 [ 416  129   26]
 [ 284   37  137]]


In [86]:
print(classification_report(y_test,pred1))

              precision    recall  f1-score   support

    negative       0.71      0.98      0.82      1716
     neutral       0.69      0.23      0.34       571
    positive       0.81      0.30      0.44       458

    accuracy                           0.71      2745
   macro avg       0.73      0.50      0.53      2745
weighted avg       0.72      0.71      0.66      2745

