In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import string

In [2]:
# Load the labelled training tweets directly from the project's GitHub repository.
train = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv'
)

In [3]:
# Keep an untouched deep copy of the training frame for later reference.
train_original = train.copy(deep=True)

In [4]:
# Preview the first five rows of the training frame.
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [5]:
# Print column dtypes and non-null counts for the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [6]:
# Load the test tweets from the same GitHub repository as the training data.
test = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/test.csv'
)

# Keep an untouched deep copy of the test frame for later reference.
test_original = test.copy(deep=True)

In [7]:
# Display the test frame (the notebook renders a truncated head/tail view).
test

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."
...,...,...
17192,49155,thought factory: left-right polarisation! #tru...
17193,49156,feeling like a mermaid ð #hairflip #neverre...
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...
17195,49158,"happy, at work conference: right mindset leads..."


# Data Pre-processing

## 1) Combine train and test datasets

In [8]:
# Stack train on top of test so both get identical preprocessing.
# Rows coming from `test` have no `label`, so that column is NaN for them.
# NOTE: each frame keeps its own index labels, so labels repeat in `combine`;
# pass ignore_index=True here if unique row labels are ever required.
combine = pd.concat(objs=[train, test], axis="index")

In [9]:
# Preview the first five rows of the combined frame (these come from `train`).
combine.head()

Unnamed: 0,id,label,tweet
0,1,0.0,@user when a father is dysfunctional and is s...
1,2,0.0,@user @user thanks for #lyft credit i can't us...
2,3,0.0,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation


In [10]:
# Preview the last five rows of the combined frame (these come from `test`).
combine.tail()

Unnamed: 0,id,label,tweet
17192,49155,,thought factory: left-right polarisation! #tru...
17193,49156,,feeling like a mermaid ð #hairflip #neverre...
17194,49157,,#hillary #campaigned today in #ohio((omg)) &am...
17195,49158,,"happy, at work conference: right mindset leads..."
17196,49159,,"my song ""so glad"" free download! #shoegaze ..."


## 2) Removing Twitter handles (@user) from the tweets

In [11]:
import re

In [12]:
def remove(text, pattern):
    """Return ``text`` with every match of the regex ``pattern`` deleted.

    Parameters
    ----------
    text : str
        The string to clean.
    pattern : str
        A regular expression; each non-overlapping match is removed.

    Returns
    -------
    str
        ``text`` with all matches replaced by the empty string. Surrounding
        whitespace is left untouched.

    Notes
    -----
    A single ``re.sub`` removes exactly the matched spans. The earlier
    find-then-``str.replace`` approach deleted every literal occurrence of
    each match, so a short match such as ``@us`` would also strip the prefix
    of a later ``@user`` and leave ``er`` behind. It also misbehaved for
    patterns containing capture groups, where ``re.findall`` returns the
    groups rather than the whole match.
    """
    return re.sub(pattern, "", text)

In [13]:
# Strip Twitter handles ("@" followed by any word characters) from each raw
# tweet and store the result in a new working column.
combine['tweets_org'] = combine['tweet'].apply(remove, args=(r"@[\w]*",))

In [14]:
# Compare the raw `tweet` column with `tweets_org` after handle removal.
combine.head()

Unnamed: 0,id,label,tweet,tweets_org
0,1,0.0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can't use cause th...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation,factsguide: society now #motivation


## 3) Removing punctuation, numbers and special characters

In [15]:
# Replace every character that is not an ASCII letter or '#' with a space.
# regex=True is required: without it, newer pandas versions treat the pattern
# as a literal string, so nothing would be replaced (older versions emit a
# FutureWarning about the changing default).
combine['tweets_org'] = combine['tweets_org'].str.replace(r'[^A-Za-z#]', " ", regex=True)

  combine['tweets_org']=combine['tweets_org'].str.replace(r'[^A-Za-z#]'," ")


In [16]:
# Preview `tweets_org` after punctuation, digits and special characters were blanked out.
combine.head()

Unnamed: 0,id,label,tweet,tweets_org
0,1,0.0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can t use cause th...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation,factsguide society now #motivation


## 4) Removing short words

In [17]:
def remove_short_words(text, max_short_len=3):
    """Drop whitespace-separated words that are ``max_short_len`` characters or shorter.

    Parameters
    ----------
    text : str
        The string to filter; it is split on any run of whitespace.
    max_short_len : int, optional
        Longest word length that is still considered "short" and removed.
        Defaults to 3, i.e. only words of four or more characters are kept.

    Returns
    -------
    str
        The kept words, each preceded by a single space (so a non-empty
        result starts with a space). Returns ``""`` when no word is kept.
    """
    kept_words = [word for word in text.split() if len(word) > max_short_len]
    # Build the result in one pass instead of repeated string concatenation;
    # the per-word leading space keeps the output identical to the original.
    return "".join(" " + word for word in kept_words)

In [18]:
# Filter short words out of every cleaned tweet, element by element.
combine['tweets_org'] = combine['tweets_org'].map(remove_short_words)

In [19]:
# Preview the first ten rows after short words were removed.
combine.head(10)

Unnamed: 0,id,label,tweet,tweets_org
0,1,0.0,@user when a father is dysfunctional and is s...,when father dysfunctional selfish drags kids ...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks #lyft credit cause they offer wheelcha...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model love take with time
4,5,0.0,factsguide: society now #motivation,factsguide society #motivation
5,6,0.0,[2/2] huge fan fare and big talking before the...,huge fare talking before they leave chaos dis...
6,7,0.0,@user camping tomorrow @user @user @user @use...,camping tomorrow danny
7,8,0.0,the next school year is the year for exams.ð...,next school year year exams think about that ...
8,9,0.0,we won!!! love the land!!! #allin #cavs #champ...,love land #allin #cavs #champions #cleveland ...
9,10,0.0,@user @user welcome here ! i'm it's so #gr...,welcome here


## 5) Tokenizing tweets into word lists and stemming

In [20]:
# Split each cleaned tweet on whitespace, giving one list of tokens per row.
tokenized_tweets = combine['tweets_org'].map(str.split)
tokenized_tweets.head()

0    [when, father, dysfunctional, selfish, drags, ...
1    [thanks, #lyft, credit, cause, they, offer, wh...
2                              [bihday, your, majesty]
3                     [#model, love, take, with, time]
4                   [factsguide, society, #motivation]
Name: tweets_org, dtype: object

In [21]:
import nltk

In [22]:
from nltk.stem import PorterStemmer

In [23]:
def stemming_words(text, stemmer=None):
    """Return a new list holding the stem of every token in ``text``.

    Parameters
    ----------
    text : list of str
        Tokens to stem. The list is not modified.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer to use. When omitted, a fresh ``PorterStemmer`` is created.
        Passing one in lets callers reuse a single instance across many calls.

    Returns
    -------
    list of str
        Stemmed tokens, in the same order as the input.
    """
    if stemmer is None:
        stemmer = PorterStemmer()
    # Build a fresh list rather than overwriting the caller's tokens in place,
    # so re-running the stemming cell does not stem already-stemmed tokens.
    return [stemmer.stem(token) for token in text]

In [24]:
# Stem every token list, one row at a time.
tokenized_tweets = tokenized_tweets.map(stemming_words)

In [25]:
# Preview the stemmed token lists.
tokenized_tweets.head()

0    [when, father, dysfunct, selfish, drag, kid, i...
1    [thank, #lyft, credit, caus, they, offer, whee...
2                              [bihday, your, majesti]
3                     [#model, love, take, with, time]
4                         [factsguid, societi, #motiv]
Name: tweets_org, dtype: object

In [26]:
def stitch_back(text):
    """Join a sequence of string tokens into one space-separated string."""
    separator = ' '
    return separator.join(text)

In [27]:
# Turn each token list back into a single space-separated string.
tokenized_tweets = tokenized_tweets.map(stitch_back)

In [28]:
# Preview the re-joined, stemmed tweets.
tokenized_tweets.head()

0    when father dysfunct selfish drag kid into dys...
1    thank #lyft credit caus they offer wheelchair ...
2                                  bihday your majesti
3                           #model love take with time
4                             factsguid societi #motiv
Name: tweets_org, dtype: object

In [29]:
# Write the stemmed text back into the working column. `tokenized_tweets` was
# derived element-wise from combine['tweets_org'], so it carries the same index
# as `combine`.
combine['tweets_org']=tokenized_tweets
# Preview the raw tweet next to its fully preprocessed form.
combine.head()

Unnamed: 0,id,label,tweet,tweets_org
0,1,0.0,@user when a father is dysfunctional and is s...,when father dysfunct selfish drag kid into dys...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thank #lyft credit caus they offer wheelchair ...
2,3,0.0,bihday your majesty,bihday your majesti
3,4,0.0,#model i love u take with u all the time in ...,#model love take with time
4,5,0.0,factsguide: society now #motivation,factsguid societi #motiv
