In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import TreebankWordTokenizer, word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

## Twitter Sentiment Analysis

This contest is taken from the real task of Text Processing.

The task is to build a model that will determine the tone (neutral, positive, negative) of the text. To do this, you will need to train the model on the existing data (train.csv). The resulting model will have to determine the class (neutral, positive, negative) of new texts (test data that were not used to build the model) with maximum accuracy.

In [2]:
tweets = pd.read_csv("C:/Users/heera/Desktop/GLabs_DSMP_New-masters/twitter/train.csv",encoding='latin')

In [3]:
tweets

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...
...,...,...,...
99984,99996,0,@Cupcake seems like a repeating problem hop...
99985,99997,1,@cupcake__ arrrr we both replied to each other...
99986,99998,0,@CuPcAkE_2120 ya i thought so
99987,99999,1,@Cupcake_Dollie Yes. Yes. I'm glad you had mor...


#### Questions for EDA

1. Total words in corpus
2. Total unique words in corpus without pre-processing
3. Total unique words in corpus after lowercase
4. Total unique words in corpus after lowercase, stemming / lemmatization
5. Total unique words in corpus after lowercase, stemming / lemmatization and stopwords removal
6. Prepricessing steps - CHALLENGE

In [4]:
# Total words in corpus

def total_tokens_raw(tweet):
    """
    Find total raw tokens
    
    Args:
    tweet - str - tweet text
    
    Returns:
    int - count of tokens in the tweet
    """
    
    return len(word_tokenize(tweet))

In [5]:
# Add count of raw tokens in twitter DF

tweets['raw_tokens_count'] = tweets['SentimentText'].map(total_tokens_raw)

In [6]:
tweets.head()

Unnamed: 0,ItemID,Sentiment,SentimentText,raw_tokens_count
0,1,0,is so sad for my APL frie...,12
1,2,0,I missed the New Moon trail...,7
2,3,1,omg its already 7:30 :O,6
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...,31
4,5,0,i think mi bf is cheating on me!!! ...,12


In [7]:
# Total tokens in the corpus

print('Total raw tokens are - {}'.format(sum(tweets['raw_tokens_count'])))

Total raw tokens are - 1720314


In [8]:
# Total unique words in corpus

"""
Steps:
------

1. Combining all the tweets
2. Work tokenization on the tweets
3. Create a set
4. Find lenth of the set
"""

# 1. Combining all the tweets
combined_tweets = ' '.join(tweets['SentimentText'])

# 2. Work tokenization on the tweets
tokenized_tweets = word_tokenize(combined_tweets)

# 3. Create a set
set_tokenized_tweets = set(tokenized_tweets)

# 4. Find lenth of the set

print('Total unqiuq tokens are - {}'.format(len(set_tokenized_tweets)))

Total unqiuq tokens are - 133560


In [9]:
# Total unique words in corpus after lowercase

print('Total unqiue lower-case tokens are - {}'.format(len(set(word_tokenize(combined_tweets.lower())))))

Total unqiue lower-case tokens are - 117940


In [10]:
# Instantiate PorterStemmer

ps = PorterStemmer()

In [11]:
# Lower case and stem the tweet

def lower_stem_tweet(tweet):
    """
    Lower cases and applies stemming
    
    Args:
    tweet - str - tweet text
    
    Returns:
    str - lower cased and stemmed tweet
    """
    
    return ' '.join([ps.stem(tok) for tok in word_tokenize(tweet.lower())])

In [12]:
# Total unique words in corpus after lowercase and stemming

# Lower cased and stemmed tweets
combined_lc_st_tweets = ' '.join([lower_stem_tweet(tweet) for tweet in tweets['SentimentText']])

print('Total unqiue lower-case and stemmed tokens are - {}'.format(len(set(word_tokenize(combined_lc_st_tweets)))))

Total unqiue lower-case and stemmed tokens are - 107045


In [13]:
# Load english stop words

stops = (stopwords.words('english'))

In [14]:
# Total unique words in corpus after lowercase, stemming and stop words removal

print('Total unqiue lower-case, stemmed and stop words excludedf tokens are - {}'.format(len(set([w for w in word_tokenize(combined_lc_st_tweets) if w not in stops]))))

Total unqiue lower-case, stemmed and stop words excludedf tokens are - 106919


#### Pre-processing for twitter data

1. Hashtags
2. Mentions (@)
3. Tagging ID
4. Numbers
5. Punctuation & Special Characters
6. Smileys
7. Emojis
8. Links (http:// or https://) & Short links (t.co)
9. HTML tags
10. Timestamp
11. Dates
12. Images (<img alt=)

In [15]:
for match in re.finditer('^\d+\s|\s\d+$|\s\d+\s', 'this is me @sagar21. how are you? this is @amit. #learning 1234 on 21/12/2019 at t.co/www.ga.com', flags=re.M):
    print(match.group())

 1234 


In [16]:
re.sub('@\w+', '__mention__', 'this is me @sagar21. how are you? this is @amit')

'this is me __mention__. how are you? this is __mention__'

In [17]:
# Identifying hashtags, mentions and links

def normalize_tweet(tweet):
    """
    Lower cases and normalizes tweet
    
    Args:
    tweet - str - tweet text
    
    Returns:
    str - lower cased and stemmed tweet
    """
    
    # Lower case the tweet
    tweet = tweet.lower()
    
    # Substitute mentions
    tweet = re.sub('@\w+', '__mention__', tweet)
    
    # Substitute hashtags
    tweet = re.sub('#\w+', '__hashtag__', tweet)
    
    # Substitute dates
    tweet = re.sub('\d\d\/\d\d\/\d\d\d\d', '__date__', tweet)
    
    # Substitute links
    tweet = re.sub('http.*|https.*|t.co\/.*', '__link__', tweet)
    
    # Substitute numbers
    tweet = re.sub('^\d+\s|\s\d+$|\s\d+\s', '__number__', tweet)
    
    return tweet

In [18]:
tweets['norm_tweet'] = tweets['SentimentText'].map(normalize_tweet)

In [19]:
tweets[tweets['SentimentText'].str.contains('#')]

Unnamed: 0,ItemID,Sentiment,SentimentText,raw_tokens_count,norm_tweet
131,132,0,#3turnoffwords this shit sucks,5,__hashtag__ this shit sucks
132,133,0,#asylm J2 panel is over. Guess it's back to n...,15,__hashtag__ j2 panel is over. guess it's back...
133,134,1,#poemsunder140 ....started by @shannonelyse1,7,__hashtag__ ....started by __mention__
134,135,0,#squarespace brighten my bad day! i never win...,12,__hashtag__ brighten my bad day! i never win ...
135,136,0,"#Susan Boyle didnt win! mh well, diversity wa...",18,"__hashtag__ boyle didnt win! mh well, diversi..."
...,...,...,...,...,...
99967,99979,1,@ctcash @buildingateam @diabetescure @chocolat...,23,__mention__ __mention__ __mention__ __mention_...
99971,99983,0,@CTerry1985 That's the thing; the new raft of ...,22,__mention__ that's the thing; the new raft of ...
99973,99985,1,@ctham #FollowFriday,4,__mention__ __hashtag__
99974,99986,0,@ctham #awaresg You are not wrong. But from a ...,33,__mention__ __hashtag__ you are not wrong. but...


### Sentiment Analysis Process

1. Import CountVectorizer
2. Create X & Y
3. Create train and test dataset (train_test_split required)
4. Fit & Transform vecotrs using X_train and just transform X_test
5. Fit model
6. predict
7. Print CLF report

In [20]:
cv = CountVectorizer()

In [21]:
# Create X & y

X = tweets['SentimentText']
y = tweets['Sentiment']

In [22]:
# Split data in to train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=1)



In [23]:
# Fit and transform vectors for X_train

X_train_vec = cv.fit_transform(X_train)

In [24]:
# Transform X_test to vectors

X_test_vec = cv.transform(X_test)

In [25]:
# Instatiate models

rf = RandomForestClassifier()
lr = LogisticRegression()
vot_hard = VotingClassifier([('rf', rf), ('lr', lr)], voting='hard')
vot_soft = VotingClassifier([('rf', rf), ('lr', lr)], voting='soft')

In [26]:
# Fit and predict

print('Random Forest Results')
print('---------------------')
rf.fit(X_train_vec, y_train)
y_pred = rf.predict(X_test_vec)
print(classification_report(y_test, y_pred))
print('')

print('Logistic Regression Results')
print('---------------------')
lr.fit(X_train_vec, y_train)
y_pred = lr.predict(X_test_vec)
print(classification_report(y_test, y_pred))
print('')

print('Voting Classifier Hard Results')
print('---------------------')
vot_hard.fit(X_train_vec, y_train)
y_pred = vot_hard.predict(X_test_vec)
print(classification_report(y_test, y_pred))
print('')

print('Voting Classifier Soft Results')
print('---------------------')
vot_soft.fit(X_train_vec, y_train)
y_pred = vot_soft.predict(X_test_vec)
print(classification_report(y_test, y_pred))
print('')

Random Forest Results
---------------------




              precision    recall  f1-score   support

           0       0.67      0.70      0.68     13064
           1       0.76      0.73      0.74     16933

   micro avg       0.72      0.72      0.72     29997
   macro avg       0.71      0.71      0.71     29997
weighted avg       0.72      0.72      0.72     29997


Logistic Regression Results
---------------------




              precision    recall  f1-score   support

           0       0.75      0.70      0.73     13064
           1       0.78      0.82      0.80     16933

   micro avg       0.77      0.77      0.77     29997
   macro avg       0.77      0.76      0.76     29997
weighted avg       0.77      0.77      0.77     29997


Voting Classifier Hard Results
---------------------




              precision    recall  f1-score   support

           0       0.66      0.82      0.73     13064
           1       0.83      0.68      0.74     16933

   micro avg       0.74      0.74      0.74     29997
   macro avg       0.75      0.75      0.74     29997
weighted avg       0.76      0.74      0.74     29997


Voting Classifier Soft Results
---------------------




              precision    recall  f1-score   support

           0       0.76      0.70      0.73     13064
           1       0.78      0.83      0.80     16933

   micro avg       0.77      0.77      0.77     29997
   macro avg       0.77      0.76      0.76     29997
weighted avg       0.77      0.77      0.77     29997




In [27]:
# Create X & y

X = tweets['norm_tweet']
y = tweets['Sentiment']

# Split data in to train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=1)

# Fit and transform vectors for X_train

X_train_vec = cv.fit_transform(X_train)

# Transform X_test to vectors

X_test_vec = cv.transform(X_test)

# Fit and predict

print('Random Forest Results')
print('---------------------')
rf.fit(X_train_vec, y_train)
y_pred = rf.predict(X_test_vec)
print(classification_report(y_test, y_pred))
print('')

print('Logistic Regression Results')
print('---------------------')
lr.fit(X_train_vec, y_train)
y_pred = lr.predict(X_test_vec)
print(classification_report(y_test, y_pred))
print('')

print('Voting Classifier Hard Results')
print('---------------------')
vot_hard.fit(X_train_vec, y_train)
y_pred = vot_hard.predict(X_test_vec)
print(classification_report(y_test, y_pred))
print('')

print('Voting Classifier Soft Results')
print('---------------------')
vot_soft.fit(X_train_vec, y_train)
y_pred = vot_soft.predict(X_test_vec)
print(classification_report(y_test, y_pred))
print('')



Random Forest Results
---------------------
              precision    recall  f1-score   support

           0       0.66      0.71      0.68     13064
           1       0.76      0.72      0.74     16933

   micro avg       0.71      0.71      0.71     29997
   macro avg       0.71      0.71      0.71     29997
weighted avg       0.72      0.71      0.72     29997


Logistic Regression Results
---------------------




              precision    recall  f1-score   support

           0       0.75      0.69      0.72     13064
           1       0.77      0.82      0.80     16933

   micro avg       0.76      0.76      0.76     29997
   macro avg       0.76      0.76      0.76     29997
weighted avg       0.76      0.76      0.76     29997


Voting Classifier Hard Results
---------------------




              precision    recall  f1-score   support

           0       0.65      0.82      0.73     13064
           1       0.83      0.66      0.74     16933

   micro avg       0.73      0.73      0.73     29997
   macro avg       0.74      0.74      0.73     29997
weighted avg       0.75      0.73      0.73     29997


Voting Classifier Soft Results
---------------------




              precision    recall  f1-score   support

           0       0.75      0.69      0.72     13064
           1       0.78      0.82      0.80     16933

   micro avg       0.77      0.77      0.77     29997
   macro avg       0.76      0.76      0.76     29997
weighted avg       0.77      0.77      0.77     29997




In [28]:
# CountVectorizer with N-grams
cv = CountVectorizer(ngram_range=(1, 3))

# Create X & y

X = tweets['norm_tweet']
y = tweets['Sentiment']

# Split data in to train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=1)

# Fit and transform vectors for X_train

X_train_vec = cv.fit_transform(X_train)

# Transform X_test to vectors

X_test_vec = cv.transform(X_test)

# Fit and predict

print('Random Forest Results')
print('---------------------')
rf.fit(X_train_vec, y_train)
y_pred = rf.predict(X_test_vec)
print(classification_report(y_test, y_pred))
print('')

print('Logistic Regression Results')
print('---------------------')
lr.fit(X_train_vec, y_train)
y_pred = lr.predict(X_test_vec)
print(classification_report(y_test, y_pred))
print('')

print('Voting Classifier Hard Results')
print('---------------------')
vot_hard.fit(X_train_vec, y_train)
y_pred = vot_hard.predict(X_test_vec)
print(classification_report(y_test, y_pred))
print('')

print('Voting Classifier Soft Results')
print('---------------------')
vot_soft.fit(X_train_vec, y_train)
y_pred = vot_soft.predict(X_test_vec)
print(classification_report(y_test, y_pred))
print('')



Random Forest Results
---------------------


KeyboardInterrupt: 

In [None]:
# CountVectorizer with N-grams
cv = CountVectorizer(ngram_range=(1, 2), stop_words='english')

# Create X & y

X = tweets['norm_tweet']
y = tweets['Sentiment']

# Split data in to train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=1)

# Fit and transform vectors for X_train

X_train_vec = cv.fit_transform(X_train)

# Transform X_test to vectors

X_test_vec = cv.transform(X_test)

# Fit and predict

print('Random Forest Results')
print('---------------------')
rf.fit(X_train_vec, y_train)
y_pred = rf.predict(X_test_vec)
print(classification_report(y_test, y_pred))
print('')

print('Logistic Regression Results')
print('---------------------')
lr.fit(X_train_vec, y_train)
y_pred = lr.predict(X_test_vec)
print(classification_report(y_test, y_pred))
print('')

print('Voting Classifier Hard Results')
print('---------------------')
vot_hard.fit(X_train_vec, y_train)
y_pred = vot_hard.predict(X_test_vec)
print(classification_report(y_test, y_pred))
print('')

print('Voting Classifier Soft Results')
print('---------------------')
vot_soft.fit(X_train_vec, y_train)
y_pred = vot_soft.predict(X_test_vec)
print(classification_report(y_test, y_pred))
print('')