In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from textblob import TextBlob
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
import numpy as np
from joblib import dump, load


def text_processing(tweet):
    """Convert a raw tweet into a list of cleaned, lemmatized tokens.

    The steps are applied in this order:

    1. Tokenize the tweet with TextBlob's ``words`` (punctuation such as
       hashtag marks is removed by the tokenizer).
    2. Drop the literal token ``'user'`` (case-sensitive) -- presumably an
       anonymized mention placeholder in the dataset; verify against the data.
    3. Keep only tokens made entirely of word characters that are not digits
       (letters and underscores); anything with digits or symbols is dropped.
    4. Lowercase, then drop English stopwords and tokens starting with
       ``"http"``.
    5. Lemmatize every remaining token as a verb.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Normalized tokens, in their original order.
    """
    # Build the stopword lookup once per call. The previous version evaluated
    # stopwords.words('english') for every single token and searched it as a
    # list; a set gives the same membership answers in constant time.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    letters_only = re.compile(r'[^\W\d]*$')

    # Join-then-split mirrors the original tokenization exactly.
    tokens = ' '.join(TextBlob(tweet).words).split()

    normalized_tweet = []
    for token in tokens:
        # Placeholder and non-alphabetic tokens are rejected before lowercasing.
        if token == 'user' or not letters_only.match(token):
            continue
        lowered = token.lower()
        if lowered in stop_words or lowered.startswith("http"):
            continue
        normalized_tweet.append(lemmatizer.lemmatize(lowered, 'v'))
    return normalized_tweet



In [3]:
# Label encoding used for training, and its inverse for decoding predictions.
labelMap = {"positive": 0, "neutral": 1, "negative": 2}
labelMap_inverted = {0: "positive", 1: "neutral", 2: "negative"}

# 'Not Available' entries are read as missing values.
df = pd.read_csv('train.csv', na_values='Not Available')

# Copy the textual category into 'Label' and encode it with labelMap.
df.loc[:, 'Label'] = df['Category']
df = df.replace({"Label": labelMap})

# Discard rows whose label is the literal string "Tweet".
df = df[df['Label'] != "Tweet"]

# Set aside rows without tweet text and train only on the rest.
tweet_is_missing = pd.isna(df['Tweet'])
df_NaN = df[tweet_is_missing]
df = df[~tweet_is_missing]

X_train = df['Tweet']
y_train = df['Label'].astype('int')

In [4]:
# Stage 1: strings -> token counts, using text_processing as the analyzer.
bow_step = ('bow', CountVectorizer(analyzer=text_processing))
# Stage 2: token counts -> weighted TF-IDF scores.
tfidf_step = ('tfidf', TfidfTransformer())
# Stage 3: Naive Bayes classifier trained on the TF-IDF vectors.
classifier_step = ('classifier', MultinomialNB())

pipeline = Pipeline(steps=[bow_step, tfidf_step, classifier_step])
pipeline.fit(X_train, y_train)

# Persist the fitted model, then read it back so the rest of the notebook
# works with the on-disk version.
dump(pipeline, 'naive_bayes.joblib')
pipeline = load('naive_bayes.joblib')

NameError: name 'X_test' is not defined

In [5]:
# Read Id with object dtype so the ids are kept exactly as written in the file;
# 'Not Available' entries are read as missing values.
test = pd.read_csv(
    'test.csv',
    dtype={'Id': object},
    na_values='Not Available',
)
test

Unnamed: 0,Id,Tweet
0,628949369883000832,dear @Microsoft the newOoffice for Mac is grea...
1,628976607420645377,@Microsoft how about you make a system that do...
2,629023169169518592,
3,629179223232479232,
4,629186282179153920,If I make a game as a #windows10 Universal App...
...,...,...
9963,,
9964,,
9965,,
9966,,


In [6]:
# Keep only the rows that actually carry an Id.
test = test[test['Id'].notna()]
test

Unnamed: 0,Id,Tweet
0,628949369883000832,dear @Microsoft the newOoffice for Mac is grea...
1,628976607420645377,@Microsoft how about you make a system that do...
2,629023169169518592,
3,629179223232479232,
4,629186282179153920,If I make a game as a #windows10 Universal App...
...,...,...
3995,641411385700712448,I am assembling an epic Pancake Posse for an I...
3996,641452712098406400,do you work at Ihop tomorrow @carlysunshine_
3997,635369700298498048,23 Aug 00;30 #771NAS Rescue193 returned from T...
3998,635769805279248384,


In [7]:
# Separate rows without tweet text (test_NaN) from rows that can be classified.
tweet_missing = test['Tweet'].isna()
test_NaN = test[tweet_missing]
test = test[~tweet_missing]

In [8]:
# Rows without tweet text cannot be classified; each gets the fixed
# label "positive".
ids_NaN = test_NaN['Id'].to_list()
out_NaN = pd.DataFrame({'Id': ids_NaN, 'Category': ["positive"] * len(ids_NaN)})
out_NaN

Unnamed: 0,Id,Category
0,629023169169518592,positive
1,629179223232479232,positive
2,631792365590695936,positive
3,633628599271190528,positive
4,636978321222467584,positive
...,...,...
355,638152771389292544,positive
356,639120359967428608,positive
357,639821082497818625,positive
358,641064131706122240,positive


In [9]:
# Preview the tweets that will be passed to the model.
test['Tweet']

0       dear @Microsoft the newOoffice for Mac is grea...
1       @Microsoft how about you make a system that do...
4       If I make a game as a #windows10 Universal App...
5       Microsoft, I may not prefer your gaming branch...
6       @MikeWolf1980 @Microsoft I will be downgrading...
                              ...                        
3994    Anybody with a Steak &amp; Shake or IHOP move ...
3995    I am assembling an epic Pancake Posse for an I...
3996         do you work at Ihop tomorrow @carlysunshine_
3997    23 Aug 00;30 #771NAS Rescue193 returned from T...
3999    IOS 9 App Transport Security. Mm need to check...
Name: Tweet, Length: 3640, dtype: object

In [10]:
# Predict an encoded label (see labelMap) for every tweet that has text.
predictions_test = pipeline.predict(test['Tweet'])
predictions_test

array([0, 0, 0, ..., 0, 1, 1])

In [11]:
# Decode the numeric predictions back to category names and pair them with
# the tweet ids (the index of `test` is carried over through the Id Series).
predicted_categories = [labelMap_inverted[code] for code in predictions_test]
out_test = pd.DataFrame({'Id': test['Id'], 'Category': predicted_categories})

In [12]:
# Preview the predicted categories next to their tweet ids.
out_test

Unnamed: 0,Id,Category
0,628949369883000832,positive
1,628976607420645377,positive
4,629186282179153920,positive
5,629226490152914944,neutral
6,629345637155360768,neutral
...,...,...
3994,641371402348679168,positive
3995,641411385700712448,positive
3996,641452712098406400,positive
3997,635369700298498048,neutral


In [13]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. The default-labelled rows come first, followed by the model
# predictions, and each frame keeps its own index values, as append did.
out = pd.concat([out_NaN, out_test])
out

Unnamed: 0,Id,Category
0,629023169169518592,positive
1,629179223232479232,positive
2,631792365590695936,positive
3,633628599271190528,positive
4,636978321222467584,positive
...,...,...
3994,641371402348679168,positive
3995,641411385700712448,positive
3996,641452712098406400,positive
3997,635369700298498048,neutral


In [14]:
# Write the combined predictions to disk; index=False leaves the DataFrame
# index out of the file.
out.to_csv("last_submission.csv", index=False)