In [8]:
import pandas as pd
import nltk

In [9]:
# Absolute path to the tweets CSV.
# NOTE(review): this is a hardcoded local Windows path; it must be edited before
# the notebook can run on any other machine.
csv_file = r'C:\Users\groov\PythonCode\output\trump_tweets\trump_tweets.csv'

# Column names are supplied explicitly through `names`.
# NOTE(review): only two names are given, yet the displayed frame carries an extra
# leading integer column as its index — presumably the file has three columns and
# the first one becomes the index; confirm against the CSV.
df = pd.read_csv(csv_file, names=['date_posted', 'tweet'])
# Show the last rows as a quick check of the load.
df.tail()

Unnamed: 0,date_posted,tweet
71,2019-10-13 22:16:39+00:00,".....BY THE WAY, DON’T CALL ME AGAIN, I’LL CAL..."
72,2019-10-13 23:09:01+00:00,.@marklevinshow on @FoxNews is doing a big sho...
73,2019-10-13 23:27:49+00:00,The U.S. has the worst of the ISIS prisoners. ...
74,2019-10-14 00:10:14+00:00,Somebody please explain to Chris Wallace of Fo...
75,2019-10-14 00:34:01+00:00,“Serial killers get more Due Process than the ...


In [10]:
# (rows, columns) of the loaded frame.
df.shape

(76, 2)

In [11]:
from nltk.tokenize import TreebankWordTokenizer

# Tokenize every tweet with NLTK's Treebank tokenizer and store the resulting
# token list next to the raw text in a new 'tokens' column.
tokenizer = TreebankWordTokenizer()
df['tokens'] = df['tweet'].map(tokenizer.tokenize)

# Preview the token lists of the first five tweets.
df['tokens'].head()

0    [“, There, are, no, felonies, ,, there, are, n...
1    [So, many, people, conveniently, forget, that,...
2    [..., ..good, health, ,, at, my, request, ,, P...
3    [We, may, be, in, the, process, of, leaving, S...
4    [..., .understands, that, while, we, only, had...
Name: tokens, dtype: object

In [12]:
from nltk.util import ngrams

# NOTE(review): df['tokens'][0:1] is a slice, i.e. a one-element Series holding the
# first tweet's token list, not the token list itself. ngrams() therefore sees a
# sequence with a single item and has no pair to emit, hence the empty result.
# Select a single row (df['tokens'][0]) to get n-grams over that tweet's tokens.
list(ngrams(df['tokens'][0:1], 2))

[]

In [13]:
# First five trigrams (3-token windows) over the first tweet's token list.
list(ngrams(df['tokens'][0], 3))[0:5]

[('“', 'There', 'are'),
 ('There', 'are', 'no'),
 ('are', 'no', 'felonies'),
 ('no', 'felonies', ','),
 ('felonies', ',', 'there')]

In [14]:
# Fetch the NLTK stop-word corpus (the log reports it as up-to-date when already present).
nltk.download('stopwords')
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stop_words_sklearn

# English stop words as provided by NLTK.
stop_words_nltk = nltk.corpus.stopwords.words('english')
# Print a five-item sample from each source; the scikit-learn collection is
# converted to a list first so it can be sliced.
print(stop_words_nltk[0:5])
print(list(stop_words_sklearn)[0:5])

['i', 'me', 'my', 'myself', 'we']
['everywhere', 'himself', 'perhaps', 'those', 'formerly']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\groov\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# De-duplicated union of the NLTK and scikit-learn stop-word collections.
stop_words = list(set(stop_words_nltk).union(stop_words_sklearn))

# Hand-picked punctuation tokens that should be dropped as well.
stop_words_mine = ['“', '”', ',', '@', '’', '!']
stop_words = stop_words + stop_words_mine

# Size of the combined list next to the size of each source.
len(stop_words), len(stop_words_nltk), len(list(stop_words_sklearn))

(384, 179, 318)

In [16]:
# Remove stop words from the first tweet's tokens and show the remainder as one string.
# NOTE(review): the membership test is case-sensitive, so capitalized tokens such as
# 'There', 'The', 'You' and 'A' remain in the result even though their lowercase
# forms look like stop words — lowercase the token before the check if that is
# not intended.
tokens = [token for token in df['tokens'][0] if token not in stop_words]
" ".join(tokens)

'There felonies Impeachable offenses. The Constitution clear need bribery treason high crimes misdemeanors. You impeached conduct alleged case. AlanDersh Dershowitz. seanhannity A Scam'

In [17]:
# stemming removes the small meaning differences of pluralization or possessive endings of words to normalize vocabulary
# it can reduce the precision score of your search results, but would improve the recall score for returning relevant docs
# so it is often useful to be able to turn off stemming where you want precision

# two of the most popular stemmers are Porter and Snowball. They were both created by the scientist Martin Porter.

In [18]:
from nltk.stem.porter import PorterStemmer

# Run the Porter stemmer over each stop-word-filtered token of the first tweet.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, tokens))

# Show the stemmed tokens as a single space-separated string.
" ".join(stemmed_tokens)

'there feloni impeach offenses. the constitut clear need briberi treason high crime misdemeanors. you impeach conduct alleg case. alandersh dershowitz. seanhann A scam'

In [19]:
from nltk.stem.snowball import SnowballStemmer

# Same tokens as the Porter cell, stemmed with the English Snowball stemmer
# so the two outputs can be compared side by side.
stemmer = SnowballStemmer(language='english')
stemmed_tokens = list(map(stemmer.stem, tokens))

# Show the stemmed tokens as a single space-separated string.
" ".join(stemmed_tokens)

'there feloni impeach offenses. the constitut clear need briberi treason high crime misdemeanors. you impeach conduct alleg case. alandersh dershowitz. seanhann a scam'

In [20]:
# lemmatization can associate words together by their meaning even if their spelling is different
# like stemming, this can make your model less precise but more general
# it is potentially more accurate than stemming because it takes into account word meaning
# some lemmatizers use the word's part of speech in addition to spelling to improve accuracy
# so, lemmatizers are better than stemmers for most applications.

# and if you really want the dimension reduction and recall improvement of a stemmer in your information
# retrieval pipeline, you should probably also use a lemmatizer right before the stemmer.
# because the lemma of a word is a valid English word, stemmers work well on the output of a lemmatizer

In [21]:
# Fetch the WordNet data the lemmatizer relies on.
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize each filtered token; only the word is passed, no part-of-speech tag.
lemmatized_tokens = list(map(lemmatizer.lemmatize, tokens))

# Show the lemmatized tokens as a single space-separated string.
" ".join(lemmatized_tokens)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\groov\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'There felony Impeachable offenses. The Constitution clear need bribery treason high crime misdemeanors. You impeached conduct alleged case. AlanDersh Dershowitz. seanhannity A Scam'

In [22]:
# Unlemmatized tokens again, for comparison with the lemmatized string.
" ".join(tokens) # slight difference: e.g. 'felonies' -> 'felony', 'crimes' -> 'crime'

'There felonies Impeachable offenses. The Constitution clear need bribery treason high crimes misdemeanors. You impeached conduct alleged case. AlanDersh Dershowitz. seanhannity A Scam'

In [23]:
# when should you use a lemmatizer or a stemmer?
# stemmers are generally faster to compute and require less-complex code and datasets. but stemmers make more errors
# and stem a far greater number of words, reducing the meaning of your text much more than a lemmatizer will.

# both stemmers and lemmatizers will reduce the vocabulary of your text and increase the ambiguity of the text. but lemmatizers do
# a better job at retaining information content of a word. so some packages such as spaCy do not provide stemming functions.

In [24]:
# VADER was one of the first rule-based sentiment analysis algorithms. 
# it stands for Valence Aware Dictionary and sEntiment Reasoner.
# nltk has an implementation of the VADER algorithm

In [26]:
# Fetch the VADER lexicon (the log reports it as up-to-date when already present).
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Single analyzer instance; the scoring cells further down call sa.polarity_scores().
sa = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\groov\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
# Score every tweet with VADER, echo the tweet and its scores, and collect each
# score component ('neg', 'neu', 'pos', 'compound') in its own list, one entry
# per tweet and in the same order as df['tweet'].
neg = []
neu = []
pos = []
compound = []

for tweet in df['tweet']:
    # Run the analyzer once per tweet and reuse the resulting dict for both the
    # printout and the four lists, instead of re-scoring the same text five times.
    scores = sa.polarity_scores(tweet)

    print(tweet)
    print(scores)
    print()

    neg.append(scores['neg'])
    neu.append(scores['neu'])
    pos.append(scores['pos'])
    compound.append(scores['compound'])

In [None]:
# Assemble the per-tweet score lists and the tweet text into one frame;
# each list is expected to hold one entry per tweet.
sa_df = pd.DataFrame(dict(neg=neg, neu=neu, pos=pos, compound=compound, tweet=df['tweet']))
sa_df.head()

In [None]:
import matplotlib.pyplot
%matplotlib inline

sa_df[['pos', 'neg']].plot.bar(stacked=True, figsize=(16,6))

In [None]:
# Summary statistics for the numeric score columns.
sa_df.describe()

In [None]:
import seaborn as sns

# Distribution of the negative scores across tweets.
# NOTE(review): distplot is deprecated in recent seaborn releases in favor of
# histplot/displot — confirm against the installed seaborn version.
sns.distplot(sa_df['neg'])

In [None]:
# Distribution of the positive scores across tweets.
# NOTE(review): distplot is deprecated in recent seaborn releases in favor of
# histplot/displot — confirm against the installed seaborn version.
sns.distplot(sa_df['pos'])

In [None]:
# Tweets whose 'pos' score exceeds 0.3, each followed by its full VADER scores
# and a blank separator line.
for tweet in sa_df.loc[sa_df['pos'] > 0.3, 'tweet']:
    print(tweet, sa.polarity_scores(tweet), sep='\n', end='\n\n')

In [None]:
# Tweets whose 'neg' score exceeds 0.3, each followed by its full VADER scores
# and a blank separator line.
for tweet in sa_df.loc[sa_df['neg'] > 0.3, 'tweet']:
    print(tweet, sa.polarity_scores(tweet), sep='\n', end='\n\n')

# the war one is interesting, because that's actually a very positive message

In [None]:
# Tweets whose 'neu' score exceeds 0.9, each followed by its full VADER scores
# and a blank separator line.
for tweet in sa_df.loc[sa_df['neu'] > 0.9, 'tweet']:
    print(tweet, sa.polarity_scores(tweet), sep='\n', end='\n\n')