In [1]:
import nltk
import re
import pandas as pd

In [2]:
nltk.download('twitter_samples')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\Hamza\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hamza\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Hamza\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
from nltk.corpus import twitter_samples
positive_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tokens = twitter_samples.tokenized('negative_tweets.json')

In [4]:
#parts of speech
from nltk.tag import pos_tag
position_tag = pos_tag(positive_tokens[2])
position_tag

[('@DespiteOfficial', 'JJ'),
 ('we', 'PRP'),
 ('had', 'VBD'),
 ('a', 'DT'),
 ('listen', 'VBN'),
 ('last', 'JJ'),
 ('night', 'NN'),
 (':)', 'NN'),
 ('As', 'IN'),
 ('You', 'PRP'),
 ('Bleed', 'VBP'),
 ('is', 'VBZ'),
 ('an', 'DT'),
 ('amazing', 'JJ'),
 ('track', 'NN'),
 ('.', '.'),
 ('When', 'WRB'),
 ('are', 'VBP'),
 ('you', 'PRP'),
 ('in', 'IN'),
 ('Scotland', 'NNP'),
 ('?', '.'),
 ('!', '.')]

In [6]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemma = []
for word, tag in position_tag:
    lemma.append(lemmatizer.lemmatize(word, pos = 'v'))
lemma

['@DespiteOfficial',
 'we',
 'have',
 'a',
 'listen',
 'last',
 'night',
 ':)',
 'As',
 'You',
 'Bleed',
 'be',
 'an',
 'amaze',
 'track',
 '.',
 'When',
 'be',
 'you',
 'in',
 'Scotland',
 '?',
 '!']

In [7]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hamza\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [8]:
without_stop = [i for i in lemma if i not in stop_words]
without_stop

['@DespiteOfficial',
 'listen',
 'last',
 'night',
 ':)',
 'As',
 'You',
 'Bleed',
 'amaze',
 'track',
 '.',
 'When',
 'Scotland',
 '?',
 '!']

In [9]:
def clean_tokens(tokens):
    lemmatizer = WordNetLemmatizer()
    clean_list = []
    for token, tag in pos_tag(tokens):
        token = token.lower()
        token = re.sub(r'@[a-z0-9_]\S+', '', token)
        token = re.sub(r'#[a-z0-9_]\S+', '', token)
        token = re.sub(r'&[a-z0-9_]\S+', '', token)
        token = re.sub(r'[?!.+,;$£%&"]+', '', token)
        token = re.sub(r'rt[\s]+', '', token)
        token = re.sub(r'\d+', '', token)
        token = re.sub(r'\$', '', token)
        token = re.sub(r'rt+', '', token)
        token = re.sub(r'https?:?\/\/\S+', '', token)
        if tag.startswith('NN'):
            position = 'n'
        elif tag.startswith('VB'):
            position = 'v'
        elif tag.startswith('RB'):
            position = 'r'
        elif tag.startswith('JJ'):
            position = 'a'
        else:
            position = 'n'

        clean_list.append(lemmatizer.lemmatize(token, pos = position))
        clean_list = [i for i in clean_list if i not in stop_words and len(i) > 0 and i != ':']

    return clean_list

In [10]:
clean_positive = list(map(clean_tokens, positive_tokens))
clean_negative = list(map(clean_tokens, negative_tokens))
print(positive_tokens[0])
print(clean_positive[0])

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']
['top', 'engage', 'member', 'community', 'week', ':)']


In [11]:
def data_prepare(tokens, status):
    featureset = [(tweet, status) for tweet in tokens]
    return featureset

In [14]:
positive_featureset = data_prepare(clean_positive, 'Positive')
negative_featureset = data_prepare(clean_negative, 'Negative')

featureset = positive_featureset + negative_featureset
featureset[-5:]

[(['wanna', 'change', 'avi', 'usanele', ':('], 'Negative'),
 (['puppy', 'broke', 'foot', ':('], 'Negative'),
 (["where's", 'jaebum', 'baby', 'picture', ':(', '('], 'Negative'),
 (['mr', 'ahmad', 'maslan', 'cook', ':('], 'Negative'),
 (['hull', 'suppoer', 'expect', 'misserable', 'week', ':-('], 'Negative')]

In [15]:
features = []
labels = []

for x in featureset:
    features.append(x[0])
    labels.append(x[1])

print(features[0])
print(labels[0])


['top', 'engage', 'member', 'community', 'week', ':)']
Positive


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range = (1, 2), sublinear_tf = True, max_features = 3000, preprocessor = ' '.join)
vectorized_features = vectorizer.fit_transform(features)

In [18]:
vocabulary = vectorizer.get_feature_names()
print(vocabulary[:15])
print(len(vocabulary))

['ab', 'able', 'able see', 'abroad', 'absolute', 'absolutely', 'abt', 'acc', 'access', 'account', 'across', 'act', 'act like', 'active', 'actually']
3000


In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(vectorized_features, labels, test_size = 0.15, shuffle = True)

In [20]:
from sklearn.ensemble import RandomForestClassifier

text_classifier = RandomForestClassifier(n_estimators = 200)
classifier = text_classifier.fit(X_train, y_train)

In [21]:
predictions = classifier.predict(X_test)


In [22]:
predictions

array(['Positive', 'Positive', 'Negative', ..., 'Positive', 'Negative',
       'Negative'], dtype='<U8')

In [23]:
from sklearn.metrics import classification_report, accuracy_score

print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))


              precision    recall  f1-score   support

    Negative       0.70      0.82      0.76       730
    Positive       0.80      0.67      0.73       770

    accuracy                           0.74      1500
   macro avg       0.75      0.75      0.74      1500
weighted avg       0.75      0.74      0.74      1500

0.7446666666666667


In [24]:
test_tweets = twitter_samples.tokenized('tweets.20150430-223406.json')
clean_test_tweets = list(map(clean_tokens, test_tweets))

print(test_tweets[0])
print(clean_test_tweets[0])


['RT', '@KirkKus', ':', 'Indirect', 'cost', 'of', 'the', 'UK', 'being', 'in', 'the', 'EU', 'is', 'estimated', 'to', 'be', 'costing', 'Britain', '£', '170', 'billion', 'per', 'year', '!', '#BetterOffOut', '#UKIP']
['indirect', 'cost', 'uk', 'eu', 'estimate', 'cost', 'britain', 'billion', 'per', 'year']


In [25]:
df = pd.DataFrame()
df['Tweets'] = clean_test_tweets
df['Sentiment'] = classifier.predict(vectorizer.transform(list(clean_test_tweets)))
df.head(10)

Unnamed: 0,Tweets,Sentiment
0,"[indirect, cost, uk, eu, estimate, cost, brita...",Positive
1,"[video, sturgeon, post-election, deal]",Negative
2,"[economy, grow, time, faster, day, david, came...",Negative
3,"[ukip, east, lothian, candidate, look, still, ...",Negative
4,"[ukip's, housing, spokesman, rake, k, housing,...",Negative
5,"[make, sure, tune, tonight, bbc]",Positive
6,"[ed, milliband, embarrassment, would, want, re...",Negative
7,"[ft, back, tory, unrelated, note, here's, phot...",Positive
8,"[“, ed, miliband, prove, tonight, he's, job, ”...",Negative
9,"[lolz, trickle, wealth, never, trickle, past, ...",Negative
