# **Twitter Sentiment Analysis**


### 1. Importing all necessary libraries

In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import re
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


### 2. Loading the dataset

In [None]:
# Folder holding every input CSV; change this one constant to run the notebook
# against a different location.
DATA_DIR = "/content/drive/My Drive/NLP"

# Labelled training tweets (columns: id, label, tweet).
train = pd.read_csv(f"{DATA_DIR}/train_E6oV3lV.csv")
# Unlabelled tweets to predict (columns: id, tweet).
test = pd.read_csv(f"{DATA_DIR}/test_tweets_anuFYb8.csv")
# Submission template (columns: id, label).
sample = pd.read_csv(f"{DATA_DIR}/sample_submission_gfvA5FD.csv")

In [None]:
# Preview the first rows of the labelled training data.
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [None]:
# Preview the first rows of the unlabelled test data.
test.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [None]:
# Preview the first rows of the submission template.
sample.head()

Unnamed: 0,id,label
0,31963,0
1,31964,0
2,31965,0
3,31966,0
4,31967,0


In [None]:
# Count the training tweets per label to check the class balance.
train.label.value_counts()

0    29720
1     2242
Name: label, dtype: int64

### 3. Text-preprocessing

In [None]:
# Keep the English stop words under their own name. Assigning the set back to
# `stopwords` would replace the imported NLTK corpus reader, and running this
# cell a second time would then fail with AttributeError on `.words`.
stop_words = set(stopwords.words('english'))
# Lower-cased list form of the stop words.
stop = [x.lower() for x in stop_words]
# Shared WordNet lemmatizer instance, used by clean().
lemma = WordNetLemmatizer()

In [None]:
# Lookup table used by clean(): chat abbreviation -> plain-English expansion.
# Keys are matched against whole lower-cased tokens.
shortcuts = {
    # single letters and contractions
    'u': 'you',
    'y': 'why',
    'r': 'are',
    'doin': 'doing',
    'hw': 'how',
    'k': 'okay',
    'm': 'am',
    'b4': 'before',
    # courtesy phrases and affection
    'idc': "i do not care",
    'ty': 'thankyou',
    'wlcm': 'welcome',
    'bc': 'because',
    '<3': 'love',
    'xoxo': 'love',
    # leet-speak and misspellings
    'ttyl': 'talk to you later',
    'gr8': 'great',
    'bday': 'birthday',
    'awsm': 'awesome',
    'gud': 'good',
    'h8': 'hate',
    # platform jargon and expletives
    'lv': 'love',
    'dm': 'direct message',
    'rt': 'retweet',
    'wtf': 'hate',
    'idgaf': 'hate',
    # acronyms
    'irl': 'in real life',
    'yolo': 'you only live once',
}

In [None]:
def clean(text):
  """Normalise one raw tweet into a space-separated string of lemmas.

  Pipeline: lower-case, replace every run of non-word characters with a
  space, tokenize, drop the 'user' token (what the '@user' handle becomes
  once '@' is stripped), expand chat shortcuts via `shortcuts`, delete
  digits, discard words of two characters or fewer, and lemmatize what
  remains with `lemma`.

  Args:
    text: the raw tweet. Any non-string value (for example NaN from an
      empty CSV cell) is treated as an empty tweet.

  Returns:
    The cleaned tweet as a single string; '' when nothing survives.
  """
  if not isinstance(text, str):
    return ''
  # Keep word characters only (letters, digits, underscore). Raw string so
  # that \W is a regex class rather than an invalid string escape.
  text = re.sub(r'\W+', ' ', text.lower()).strip()
  tokens = word_tokenize(text)
  # Drop 'user' as a whole token only, so words that merely contain it
  # ("users", "username") stay intact, then expand known shortcuts.
  # Shortcut keys containing punctuation (such as '<3') cannot match here
  # because punctuation has already been stripped.
  expanded = [shortcuts.get(token, token) for token in tokens if token != 'user']
  # Digits are removed after shortcut expansion so keys like 'b4' and 'gr8'
  # are still recognised above.
  without_digits = (re.sub(r'\d+', '', word) for word in expanded)
  lemmas = [lemma.lemmatize(word) for word in without_digits if len(word) > 2]
  return " ".join(lemmas)

In [None]:
# Raw tweet text (features) and labels (target) pulled out as pandas Series.
X_train, y = train['tweet'], train['label']
X_test = test['tweet']

In [None]:
# Run every tweet through clean(); Series.apply calls it once per element.
clean_Xtrain = X_train.apply(clean)
clean_Xtest = X_test.apply(clean)

### 4. Text Vectorization using BoW and TF-IDF

In [None]:
# Bag-of-words counts; max_df=0.5 ignores terms present in more than half of
# the training tweets.
# NOTE(review): the vocabulary is learned from every training tweet, including
# the rows that are later held out for validation.
bow = CountVectorizer(max_df=0.5).fit(clean_Xtrain)
X_bow = bow.transform(clean_Xtrain)
X_test_bow = bow.transform(clean_Xtest)

In [None]:
# TF-IDF weighted features with the vectorizer's default settings.
# NOTE(review): vocabulary and IDF weights are learned from every training
# tweet, including the rows that are later held out for validation.
tfidf = TfidfVectorizer().fit(clean_Xtrain)
X_tfidf = tfidf.transform(clean_Xtrain)
X_test_tfidf = tfidf.transform(clean_Xtest)

### 5. Train-test split

In [None]:
# Hold out 20% of the bag-of-words rows for validation; the fixed seed keeps
# the split repeatable.
# NOTE(review): the labels are imbalanced and the split is not stratified;
# consider stratify=y.
X_train_bow, X_val_bow, y_train_bow, y_val_bow = train_test_split(
    X_bow,
    y,
    test_size=0.20,
    random_state=0,
)

In [None]:
# Hold out 20% of the TF-IDF rows for validation; the fixed seed keeps the
# split repeatable.
# NOTE(review): the labels are imbalanced and the split is not stratified;
# consider stratify=y.
X_train_tfidf, X_val_tfidf, y_train_tfidf, y_val_tfidf = train_test_split(
    X_tfidf,
    y,
    test_size=0.20,
    random_state=0,
)

### 6. Model Evaluation


> a) Logistic Regression



In [None]:
# Logistic regression on bag-of-words features, scored on the validation split.
lr = LogisticRegression()
lr.fit(X_train_bow, y_train_bow)
lr_y_pred_bow = lr.predict(X_val_bow)
# scikit-learn metrics take the ground truth first: (y_true, y_pred).
print('Accuracy:', accuracy_score(y_val_bow, lr_y_pred_bow))
print("F1 Score: ", f1_score(y_val_bow, lr_y_pred_bow))

Accuracy: 0.9627717816361645
F1 Score:  0.6404833836858006


In [None]:
# Logistic regression on TF-IDF features, scored on the validation split.
# This rebinds `lr`, replacing the bag-of-words model.
lr = LogisticRegression()
lr.fit(X_train_tfidf, y_train_tfidf)
lr_y_pred_tfidf = lr.predict(X_val_tfidf)
# scikit-learn metrics take the ground truth first: (y_true, y_pred).
print('Accuracy:', accuracy_score(y_val_tfidf, lr_y_pred_tfidf))
print("F1 Score: ", f1_score(y_val_tfidf, lr_y_pred_tfidf))

Accuracy: 0.9519787267323635
F1 Score:  0.42830540037243947




> b) Linear SVM



In [None]:
# Linear SVM on bag-of-words features, scored on the validation split.
svc = LinearSVC()
svc.fit(X_train_bow, y_train_bow)
svc_y_pred_bow = svc.predict(X_val_bow)
# scikit-learn metrics take the ground truth first: (y_true, y_pred).
print('Accuracy:', accuracy_score(y_val_bow, svc_y_pred_bow))
print("F1 Score: ", f1_score(y_val_bow, svc_y_pred_bow))

Accuracy: 0.9629282027217269
F1 Score:  0.6810228802153433


In [None]:
# Linear SVM on TF-IDF features, scored on the validation split.
# This rebinds `svc`, replacing the bag-of-words model.
svc = LinearSVC()
svc.fit(X_train_tfidf, y_train_tfidf)
svc_y_pred_tfidf = svc.predict(X_val_tfidf)
# scikit-learn metrics take the ground truth first: (y_true, y_pred).
print('Accuracy:', accuracy_score(y_val_tfidf, svc_y_pred_tfidf))
print("F1 Score: ", f1_score(y_val_tfidf, svc_y_pred_tfidf))

Accuracy: 0.9662130455185359
F1 Score:  0.6727272727272727




> c) Naive Bayes



In [None]:
# Multinomial naive Bayes on bag-of-words features, scored on the validation split.
nb = MultinomialNB()
nb.fit(X_train_bow, y_train_bow)
nb_y_pred_bow = nb.predict(X_val_bow)
# scikit-learn metrics take the ground truth first: (y_true, y_pred).
print('Accuracy:', accuracy_score(y_val_bow, nb_y_pred_bow))
print("F1 Score: ", f1_score(y_val_bow, nb_y_pred_bow))

Accuracy: 0.9457218833098702
F1 Score:  0.5475880052151239


In [None]:
# Multinomial naive Bayes on TF-IDF features, scored on the validation split.
# This rebinds `nb`, replacing the bag-of-words model.
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train_tfidf)
nb_y_pred_tfidf = nb.predict(X_val_tfidf)
# scikit-learn metrics take the ground truth first: (y_true, y_pred).
print('Accuracy:', accuracy_score(y_val_tfidf, nb_y_pred_tfidf))
print("F1 Score: ", f1_score(y_val_tfidf, nb_y_pred_tfidf))

Accuracy: 0.9421241983419365
F1 Score:  0.17040358744394618




> d) Stochastic gradient descent



In [None]:
# SGD-trained linear classifier on bag-of-words features, scored on the
# validation split. The seed is fixed because SGD shuffles the training data,
# so an unseeded run gives different scores each time.
sgd = SGDClassifier(random_state=0)
sgd.fit(X_train_bow, y_train_bow)
sgd_y_pred_bow = sgd.predict(X_val_bow)
# scikit-learn metrics take the ground truth first: (y_true, y_pred).
print('Accuracy:', accuracy_score(y_val_bow, sgd_y_pred_bow))
print("F1 Score: ", f1_score(y_val_bow, sgd_y_pred_bow))

Accuracy: 0.9644924135773503
F1 Score:  0.6676427525622255


In [None]:
# SGD-trained linear classifier on TF-IDF features, scored on the validation
# split. This rebinds `sgd`, replacing the bag-of-words model. The seed is
# fixed because SGD shuffles the training data, so an unseeded run gives
# different scores each time.
sgd = SGDClassifier(random_state=0)
sgd.fit(X_train_tfidf, y_train_tfidf)
sgd_y_pred_tfidf = sgd.predict(X_val_tfidf)
# scikit-learn metrics take the ground truth first: (y_true, y_pred).
print('Accuracy:', accuracy_score(y_val_tfidf, sgd_y_pred_tfidf))
print("F1 Score: ", f1_score(y_val_tfidf, sgd_y_pred_tfidf))

Accuracy: 0.9563585171281088
F1 Score:  0.5008944543828264


In [None]:
# How many validation tweets the TF-IDF linear SVM assigned to each class.
df = pd.DataFrame({'y_pred': svc_y_pred_tfidf})
df['y_pred'].value_counts()

0    6141
1     252
Name: y_pred, dtype: int64

In [None]:
# train model with full data and predict for new samples
# Refit the linear SVM on the TF-IDF matrix of the whole training set
# (`X_tfidf`), then predict the test set with that same model.
svc.fit(X_tfidf, y)
# Reuses the name of the validation predictions; from here on it holds the
# test-set predictions.
svc_y_pred_tfidf = svc.predict(X_test_tfidf)

In [None]:
# Fill the submission template's label column with the predictions and save
# it as CSV without the index column.
submission_path = '/content/drive/My Drive/NLP/submission.csv'
sample['label'] = svc_y_pred_tfidf
sample.to_csv(submission_path, index=False)