# EDA Notebook

In [1]:
import os
import sys

# Application root: the part of the current working directory that precedes
# its last two path separators.
app_path, *_ = os.getcwd().rsplit(os.sep, 2)

# Register the application root on sys.path (only once, and ahead of the
# existing entries) so modules from the parent directories can be imported.
already_registered = app_path in sys.path
if not already_registered:
    sys.path.insert(0, app_path)

# Note: Do NOT delete this cell

## Imports

In [2]:
import pandas as pd
import time

from functools import partial

from utils.paths import data_path
from utils.helpers import preprocess, get_hashtags, save_as_csv, get_subjectivity, get_polarity

## Reading raw data

In [3]:
FILE = "test"  # Which split to process: "train" or "test".

# Raw CSV file name for every supported split.
RAW_FILES = {"train": "train.csv", "test": "test.csv"}

# Fail loudly on an unsupported split instead of leaving `raw_data` undefined
# (or silently reusing a stale value from an earlier run of this cell).
if FILE not in RAW_FILES:
    raise ValueError(
        "FILE must be one of {}, got {!r}".format(sorted(RAW_FILES), FILE)
    )

raw_data = data_path(RAW_FILES[FILE])
raw_df   = pd.read_csv(raw_data)

# Processed frame is an independent copy: a plain `pro_df = raw_df` would only
# alias the same object, so every column written to pro_df later on would also
# overwrite the raw data.
pro_df   = raw_df.copy()

raw_df.head()

Unnamed: 0,tweet_id,tweet,sentiment
0,1701,#sxswnui #sxsw #apple defining language of tou...,1
1,1851,Learning ab Google doodles! All doodles should...,1
2,2689,one of the most in-your-face ex. of stealing t...,2
3,4525,This iPhone #SXSW app would b pretty awesome i...,0
4,3604,Line outside the Apple store in Austin waiting...,1



## Creating hashtags column

In [4]:
# Extract the hashtags of every raw tweet into their own column.
# Series.apply takes no axis argument; a stray positional `1` would be read as
# the deprecated `convert_dtype` parameter, so only the function is passed.
pro_df['hashtags'] = raw_df.tweet.apply(get_hashtags)
pro_df.hashtags

0                           sxswnui sxsw apple
1                            googledoodle sxsw
2                                         sxsw
3                    sxsw fuckit illmakeitwork
4                                         sxsw
                         ...                  
7269                                      sxsw
7270    edchat musedchat sxsw sxswi newtwitter
7271                                      sxsw
7272                                      sxsw
7273                         bankinnovate sxsw
Name: hashtags, Length: 7274, dtype: object

## Preprocessing all tweets to remove links, @mentions, numbers and special characters

In [5]:
# Time the whole preprocessing pass; perf_counter is a monotonic clock meant
# for measuring elapsed durations.
start = time.perf_counter()

# Bind lema=True once so the helper can be applied as a one-argument function
# (presumably this switches on lemmatisation inside `preprocess` - confirm).
preprocess_lema = partial(preprocess, lema=True)

# Series.apply takes no axis argument, so only the function is passed.
pro_df['tweet'] = raw_df.tweet.apply(preprocess_lema)

end = time.perf_counter()
print("Time elapsed: {}".format(end-start))

pro_df.head()

Time elapsed: 7.663507699966431


Unnamed: 0,tweet_id,tweet,sentiment,hashtags
0,1701,sxswnui sxsw apple define language touch diffe...,1,sxswnui sxsw apple
1,1851,learn ab google doodle doodle light funny amp ...,1,googledoodle sxsw
2,2689,one face ex steal show yrs mention quot sxsw a...,2,sxsw
3,4525,iphone sxsw app would b pretty awesome crash e...,0,sxsw fuckit illmakeitwork
4,3604,line outside apple store austin wait new ipad ...,1,sxsw


## Subjectivity and Polarity columns

In [6]:
# Score every (already preprocessed) tweet and store the results as new columns.
# Series.apply takes no axis argument, so only the function is passed.
pro_df['subjectivity'] = pro_df.tweet.apply(get_subjectivity)
pro_df['polarity'] = pro_df.tweet.apply(get_polarity)

pro_df

Unnamed: 0,tweet_id,tweet,sentiment,hashtags,subjectivity,polarity
0,1701,sxswnui sxsw apple define language touch diffe...,1,sxswnui sxsw apple,0.550000,0.000000
1,1851,learn ab google doodle doodle light funny amp ...,1,googledoodle sxsw,0.893750,0.381250
2,2689,one face ex steal show yrs mention quot sxsw a...,2,sxsw,0.000000,0.000000
3,4525,iphone sxsw app would b pretty awesome crash e...,0,sxsw fuckit illmakeitwork,1.000000,0.625000
4,3604,line outside apple store austin wait new ipad ...,1,sxsw,0.252273,0.068182
...,...,...,...,...,...,...
7269,3343,google plze tammi middle sxsw craziness everyt...,1,sxsw,0.150000,0.050000
7270,5334,mention set link edchat musedchat sxsw sxswi n...,1,edchat musedchat sxsw sxswi newtwitter,0.000000,0.000000
7271,5378,mention aha find proof lactation room excuse q...,1,sxsw,0.058333,-0.025000
7272,2173,launch ipad app sxsw get detail first edition ...,1,sxsw,0.566667,0.325000


## VADER Sentiment Analysis

In [7]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Module-level VADER analyser instance used by the get_*_polarity helpers.
analyser = SentimentIntensityAnalyzer()

def get_neg_polarity(text: str) -> float:
    """Return the 'neg' entry of the VADER polarity scores for `text`.

    Args:
        text: The text to score.

    Returns:
        The value stored under the 'neg' key of
        `analyser.polarity_scores(text)`.
    """
    return analyser.polarity_scores(text)['neg']

def get_neu_polarity(text: str) -> float:
    """Return the 'neu' entry of the VADER polarity scores for `text`.

    Args:
        text: The text to score.

    Returns:
        The value stored under the 'neu' key of
        `analyser.polarity_scores(text)`.
    """
    return analyser.polarity_scores(text)['neu']

def get_pos_polarity(text: str) -> float:
    """Return the 'pos' entry of the VADER polarity scores for `text`.

    Args:
        text: The text to score.

    Returns:
        The value stored under the 'pos' key of
        `analyser.polarity_scores(text)`.
    """
    return analyser.polarity_scores(text)['pos']

def get_compound_polarity(text: str) -> float:
    """Return the 'compound' entry of the VADER polarity scores for `text`.

    Args:
        text: The text to score.

    Returns:
        The value stored under the 'compound' key of
        `analyser.polarity_scores(text)`.
    """
    return analyser.polarity_scores(text)['compound']

In [8]:
# One column per VADER score, computed from the preprocessed tweet text.
# Series.apply takes no axis argument, so only the function is passed.
pro_df['negative'] = pro_df.tweet.apply(get_neg_polarity)
pro_df['positive'] = pro_df.tweet.apply(get_pos_polarity)
pro_df['neutral']  = pro_df.tweet.apply(get_neu_polarity)
pro_df['compound'] = pro_df.tweet.apply(get_compound_polarity)

pro_df.head()


Unnamed: 0,tweet_id,tweet,sentiment,hashtags,subjectivity,polarity,negative,positive,neutral,compound
0,1701,sxswnui sxsw apple define language touch diffe...,1,sxswnui sxsw apple,0.55,0.0,0.0,0.0,1.0,0.0
1,1851,learn ab google doodle doodle light funny amp ...,1,googledoodle sxsw,0.89375,0.38125,0.0,0.409,0.591,0.765
2,2689,one face ex steal show yrs mention quot sxsw a...,2,sxsw,0.0,0.0,0.186,0.0,0.814,-0.4939
3,4525,iphone sxsw app would b pretty awesome crash e...,0,sxsw fuckit illmakeitwork,1.0,0.625,0.124,0.415,0.461,0.743
4,3604,line outside apple store austin wait new ipad ...,1,sxsw,0.252273,0.068182,0.0,0.0,1.0,0.0


## POS Tagging

In [9]:
import nltk

# sent = "Brilliant i hope they have plenty of iPad RT Apple to Open Pop Up Shop at SXSW REPORT sxsw"

# tokens = nltk.word_tokenize(sent.lower())
# print(tokens)

# nltk.pos_tag(tokens)



## Processed DataFrame to CSV

In [10]:

# Output file name for each supported split; any other FILE value saves nothing.
preprocessed_names = {
    "train": "train-preprocessed.csv",
    "test": "test-preprocessed.csv",
}

if FILE in preprocessed_names:
    save_as_csv(pro_df, preprocessed_names[FILE])
    

Generating CSV at c:\Users\karan\Desktop\GLabs_DSMP\Twitter-Sentiment-Analysis\notebooks\data\train-preprocessed.csv
Done
