<a href="https://colab.research.google.com/github/klnsuman/SentimentAnalysis/blob/main/SentimentAnalysis_1part.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
from pathlib import Path
home_dir = Path(".").resolve()
main_dir = home_dir.parent
data_dir = str(main_dir) + "/data"

In [36]:
import os
os.chdir("/content/drive/MyDrive/Data")

In [37]:
os.listdir()

['chicago_census.csv',
 'movie_reviews.csv',
 'household_poverty.csv',
 'DTM_matrix.sav',
 'recommender_evaluate.sav',
 'reviews_clean_list_ex.sav',
 'score_labels.sav',
 'temp_heart_rate.csv',
 'NYT_article_data.csv',
 'NYT_clean_list.sav',
 'bank_clean.sav',
 'score_labels_ex.sav',
 'movies.csv',
 'users.csv',
 'movies-subset.csv',
 'lastfm_artists.csv',
 'ratings-subset.csv',
 'reviews_DTM_array.sav',
 'costa_rica_poverty.csv',
 'bank_marketing.csv',
 'bank_marketing.sav',
 'ratings.csv',
 'corrMatrix_ex.csv',
 'lastfm_ratings.csv',
 'corrMatrix.csv',
 'ex_corrMatrix.csv']

In [38]:
#=================================================-
#### Slide 10: Loading packages  ####

# Helper packages.
import os
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
# Packages for working with text data and analyzing sentiment
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer

#set up nltk packages
import nltk
nltk.download('all')
import nltk.data
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-dat

In [39]:
NYT = pd.read_csv("NYT_article_data.csv")
print(NYT.columns)

Index(['web_url', 'headline', 'snippet', 'word_count', 'source',
       'type_of_material', 'date'],
      dtype='object')


In [40]:
NYT.shape

(250, 7)

In [41]:
#=================================================-
#### Slide 16: Tokenization: split each snippet into words  ####

# Tokenize each snippet into a large list of tokenized snippets.
NYT_snippet = NYT["snippet"]
NYT_tokenized = [word_tokenize(NYT_snippet[i]) for i in range(0, len(NYT_snippet))]

In [42]:
#=================================================-
#### Slide 18: Implementing pre-processing steps on a corpus  ####

# Create a list for clean snippets.
NYT_clean = [None] * len(NYT_tokenized)
# Create a list of word counts for each clean snippet.
word_counts_per_snippet = [None] * len(NYT_tokenized)
# Process words in all snippets.
for i in range(len(NYT_tokenized)):
    # 1. Convert to lower case.
    NYT_clean[i] = [snippet.lower() for snippet in NYT_tokenized[i]]

    # 2. Remove stop words.
    stop_words = stopwords.words('english')
    NYT_clean[i] = [word for word in NYT_clean[i] if not word in stop_words]

    # 3. Remove punctuation and any non-alphabetical characters.
    NYT_clean[i] = [word for word in NYT_clean[i] if word.isalpha()]

    # 4. Stem words.
    NYT_clean[i] = [PorterStemmer().stem(word) for word in NYT_clean[i]]

    # Record the word count per snippet.
    word_counts_per_snippet[i] = len(NYT_clean[i])

In [43]:
#=================================================-
#### Slide 19: Inspect results  ####

print(NYT_clean[0][:10],type(NYT_clean[0][:10]))
print(NYT_clean[5][:10])
print(NYT_clean[10][:10])

['pakistan', 'struggl', 'batsmen', 'must', 'find', 'way', 'handl', 'south', 'africa', 'potent'] <class 'list'>
['pakistan', 'former', 'prime', 'minist', 'nawaz', 'sharif', 'appeal', 'convict', 'prison', 'sentenc']
['still', 'reckon', 'fallout', 'emmett', 'till', 'paint', 'chasten', 'artist', 'reveal', 'controversi']


In [44]:
#=================================================-
#### Slide 20: Removing empty and very short snippets  ####

print(word_counts_per_snippet[:10])


[24, 12, 19, 20, 19, 15, 23, 15, 22, 27]


In [45]:
#=================================================-
#### Slide 21: Removing empty and very short snippets (cont'd)  ####

# Convert word counts list and snippets list to numpy arrays.
word_counts_array = np.array(word_counts_per_snippet)
NYT_array = np.array(NYT_clean, dtype=object)
print(len(NYT_array))
# Find indices of all snippets where there are greater than or equal to 5 words.
valid_snippets = np.where(word_counts_array >= 5)[0]
print(len(valid_snippets))

250
248


In [52]:
np.where(word_counts_array >= 5)

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 123, 124, 125, 126, 127, 128, 129, 130,
        131, 132, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
        145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
        158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170,
        171, 172, 173, 174, 175, 176, 177, 178, 179

In [46]:
valid_snippets

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
       145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
       158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170,
       171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 18

In [47]:
#=================================================-
#### Slide 22: Removing empty and very short snippets (cont'd)  ####

# Subset the NYT_array to keep only those where there are at least 5 words.
NYT_array = NYT_array[valid_snippets]
print(len(NYT_array))

# Convert the array back to a list.
NYT_clean = NYT_array.tolist()
print(NYT_clean[:3])

248
[['pakistan', 'struggl', 'batsmen', 'must', 'find', 'way', 'handl', 'south', 'africa', 'potent', 'pace', 'attack', 'claw', 'way', 'back', 'seri', 'second', 'test', 'start', 'like', 'live', 'newland', 'wicket', 'thursday'], ['nation', 'footbal', 'leagu', 'microscop', 'lack', 'minor', 'head', 'coach', 'recent', 'slew', 'fire', 'leagu'], ['hit', 'hot', 'streak', 'right', 'time', 'goal', 'golf', 'top', 'male', 'profession', 'year', 'new', 'calendar', 'cram', 'major', 'championship', 'super', 'busi', 'stretch']]


In [48]:
NYT.head(2)

Unnamed: 0,web_url,headline,snippet,word_count,source,type_of_material,date
0,https://www.nytimes.com/reuters/2019/01/01/spo...,Pakistan Look to Fix Batting Woes Against Host...,Pakistan's struggling batsmen must find a way ...,571,Reuters,News,2019-01-01
1,https://www.nytimes.com/reuters/2019/01/01/spo...,NFL: League Under Scrutiny for Lack of Minorit...,The National Football League is under the micr...,393,Reuters,News,2019-01-01


In [49]:
''' stop_words = stopwords.words('english')
def splittext(x):
  #return len([",".join(x.split(" "))])
  x = x.lower()
  x = x.split(" ")
  x_l = [i for i in x if i not in stop_words]
  return len(x_l)
NYT.snippet.apply(lambda x : splittext(x))'''

' stop_words = stopwords.words(\'english\')\ndef splittext(x):\n  #return len([",".join(x.split(" "))])\n  x = x.lower()\n  x = x.split(" ")\n  x_l = [i for i in x if i not in stop_words]\n  return len(x_l)\nNYT.snippet.apply(lambda x : splittext(x))'

In [53]:
#=================================================-
#### Slide 23: Save processed text to file using .join()  ####

# Join words in each snippet into a single character string.
NYT_clean_list = [' '.join(snippet) for snippet in NYT_clean]
print(NYT_clean_list[:5])

['pakistan struggl batsmen must find way handl south africa potent pace attack claw way back seri second test start like live newland wicket thursday', 'nation footbal leagu microscop lack minor head coach recent slew fire leagu', 'hit hot streak right time goal golf top male profession year new calendar cram major championship super busi stretch', 'pope franci usher new year ode motherhood tuesday remind faith mother exampl embrac best antidot today disjoint world solitud miseri', 'chri froom defend giro titl year choos focu win fifth tour de franc crown instead team sky announc tuesday']


In [54]:
#=================================================-
#### Slide 26: Create a DTM  ####

# Initialize `CountVectorizer`.
vec = CountVectorizer()

# Transform the list of snippets into DTM.
X = vec.fit_transform(NYT_clean_list)
print(X.toarray()) #<- to show output as a matrix

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [58]:
#=================================================-
#### Slide 38: Text classification - classify (cont'd)  ####

# Initialize the `SentimentIntensityAnalyzer().`
sid = SentimentIntensityAnalyzer()

# Iterate through each sentence printing out the scores for each.
for sentence in NYT_clean_list[:5]:
     print("\n Sen",sentence)
     ss = sid.polarity_scores(sentence)
     for k in ss:
         print('{0}: {1}, '.format(k, ss[k]), end='')


 Sen pakistan struggl batsmen must find way handl south africa potent pace attack claw way back seri second test start like live newland wicket thursday
neg: 0.112, neu: 0.797, pos: 0.091, compound: -0.1531, 
 Sen nation footbal leagu microscop lack minor head coach recent slew fire leagu
neg: 0.32, neu: 0.68, pos: 0.0, compound: -0.5719, 
 Sen hit hot streak right time goal golf top male profession year new calendar cram major championship super busi stretch
neg: 0.0, neu: 0.65, pos: 0.35, compound: 0.8225, 
 Sen pope franci usher new year ode motherhood tuesday remind faith mother exampl embrac best antidot today disjoint world solitud miseri
neg: 0.0, neu: 0.72, pos: 0.28, compound: 0.7906, 
 Sen chri froom defend giro titl year choos focu win fifth tour de franc crown instead team sky announc tuesday
neg: 0.0, neu: 0.826, pos: 0.174, compound: 0.5859, 

In [59]:
#=================================================-
#### Slide 40: Text classification - classify (cont'd)  ####

# This function outputs a list of labels for snippet:
def sentiment_analysis(texts):
        list_of_scores = []
        for text in texts:
            sid = SentimentIntensityAnalyzer()
            compound = sid.polarity_scores(text)["compound"]
            if compound >= 0:
                list_of_scores.append("positive")
            else:
                list_of_scores.append("negative")
        return(list_of_scores)
score_labels = sentiment_analysis(NYT_clean_list)
print(score_labels[1:5])

['negative', 'positive', 'positive', 'positive']


In [62]:
#=================================================-
#### Slide 42: Save results as a pickle  ####

pickle.dump(NYT_clean_list, open('/NYT_clean_list.sav', 'wb'))
pickle.dump(score_labels, open('/score_labels.sav', 'wb'))
pickle.dump(X, open('/DTM_matrix.sav', 'wb'))