In [1]:
# Data Cleaning/processing/language parsing
import pandas as pd, spacy, nltk, re
from nltk.corpus import twitter_samples
from collections import Counter

#nltk.download()

print(twitter_samples.fileids())
neg = twitter_samples.strings('negative_tweets.json')
pos = twitter_samples.strings('positive_tweets.json')

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']


In [7]:
# Remove mentions, hashtags, and links
def text_cleaner(text):
    clean_text = []
    for tweet in text:
        tweet = re.sub('@\S+', '', tweet)
        tweet = re.sub('http\S+', '', tweet)
        tweet = re.sub('#\S+', '', tweet)
        tweet = re.sub('\.+', '', tweet)
        tweet = re.sub(':\S+', '', tweet)
        tweet = ' '.join(tweet.split())
        clean_text.append(tweet)
    
    return clean_text

clean_pos = text_cleaner(pos[0:1000])
clean_neg = text_cleaner(neg[0:1000])

#pos_tweet = [[tweet, 'Positive'] for tweet in clean_pos]
#neg_tweet = [[tweet, 'Positive'] for tweet in clean_neg]

nlp = spacy.load('en')

pos_string = " ".join(tweet for tweet in clean_pos)
neg_string = " ".join(tweet for tweet in clean_neg)

pos_str_doc = nlp(pos_string)
neg_str_doc = nlp(neg_string)

pos_doc = [[nlp(tweet), "Positive"] for tweet in clean_pos]
neg_doc = [[nlp(tweet), "Negative"] for tweet in clean_neg]

tweet_df = pd.DataFrame(pos_doc + neg_doc)
tweet_df.head()

Unnamed: 0,0,1
0,"(for, being, top, engaged, members, in, my, co...",Positive
1,"(Hey, James, !, How, odd, Please, call, our, C...",Positive
2,"(we, had, a, listen, last, night, As, You, Ble...",Positive
3,(CONGRATS),Positive
4,"(yeaaaah, yippppy, !, !, !, my, accnt, verifie...",Positive


In [8]:
# Create features using two different NLP methods: BoW 

# Utility function to define the 2000 most common words
def bag_of_words(text):
    
    # Filter punctionation and stopwords
    allwords = [token.lemma_.lower() for token in text 
                if not token.is_punct and not token.is_stop]
    
    # Return 2000 most common words
    return [item[0] for item in Counter(allwords).most_common(2000)]

# Create DF with features
def bow_features(sentences, common_words):
    
    # Scaffold the DF and initialize counts to 0
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_sources'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurance of words in each sentence
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert to lemmas, filter out punctuation, stopwords and uncommon words 
        words = [token.lemma_
                 for token in sentence
                 if (not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words)]
        
        # Populate rows with word counts
        for word in words:
            df.loc[i, word] += 1
        
        # Checks for hung kernel
        if i % 50 == 0:
            print("Processing row {}".format(i))
        
    return df

# Set up bags
pos_words = bag_of_words(pos_str_doc)
neg_words = bag_of_words(neg_str_doc)

# Combine two bags to create a set of unique words
common_words = set(pos_words + neg_words)

# Create dataset with features. Computationally intensive
word_counts = bow_features(tweet_df, common_words)
word_counts.head()

Processing row 0
Processing row 50
Processing row 100
Processing row 150
Processing row 200
Processing row 250
Processing row 300
Processing row 350
Processing row 400
Processing row 450
Processing row 500
Processing row 550
Processing row 600
Processing row 650
Processing row 700
Processing row 750
Processing row 800
Processing row 850
Processing row 900
Processing row 950
Processing row 1000
Processing row 1050
Processing row 1100
Processing row 1150
Processing row 1200
Processing row 1250
Processing row 1300
Processing row 1350
Processing row 1400
Processing row 1450
Processing row 1500
Processing row 1550
Processing row 1600
Processing row 1650
Processing row 1700
Processing row 1750
Processing row 1800
Processing row 1850
Processing row 1900
Processing row 1950


Unnamed: 0,finger,work,dry,bisexuality,ass,accumulate,eng,phantasy,em,nike,...,touch,hat,till,al,yung,properly,slot,full,text_sentence,text_sources
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(for, being, top, engaged, members, in, my, co...",Positive
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Hey, James, !, How, odd, Please, call, our, C...",Positive
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(we, had, a, listen, last, night, As, You, Ble...",Positive
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,(CONGRATS),Positive
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(yeaaaah, yippppy, !, !, !, my, accnt, verifie...",Positive


In [18]:
#Use the features to fit supervised learning models for each feature set to predict the category outcomes.

# BoW with random forest
import numpy as np
from sklearn import ensemble

rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_sources']
X = np.array(word_counts.drop(['text_sentence', 'text_sources'], 1))

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4,
                                                   random_state=0)

train = rfc.fit(X_train, Y_train)
rfc_pred = rfc.predict(X_test)
print("Training Set Score:", rfc.score(X_train, Y_train))
print("Test Set Score:", rfc.score(X_test, Y_test))
pd.crosstab(Y_test, rfc_pred)

# Bag of Words with logistic regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_train, Y_train)
lr_pred = lr.predict(X_test)
print("Training Set Score:", lr.score(X_train, Y_train))
print("Test Set Score:", lr.score(X_test, Y_test))
pd.crosstab(Y_test, lr_pred)



Training Set Score: 0.9591666666666666
Test Set Score: 0.6875




Training Set Score: 0.9191666666666667
Test Set Score: 0.695


col_0,Negative,Positive
text_sources,Unnamed: 1_level_1,Unnamed: 2_level_1
Negative,306,83
Positive,161,250


In [20]:
# Create features using two different NLP methods: tf-idf.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

clean_tweet = clean_pos + clean_neg

X_train, X_test = train_test_split(clean_tweet, test_size=0.4, random_state=0)

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, #only use words that appear at least twice
                             stop_words='english',
                             lowercase=True, #converts all words to lowercase
                             use_idf=True, #use idf as weights
                             norm=u'l1', #correction factor for document length (L1, L2, or max)
                             smooth_idf=True #Adds 1 to all doc frequencies, prevents divide by zero errors
                            )

# Apply vectorizer and split to training/test sets
tweet_tfidf = vectorizer.fit_transform(clean_tweet)
print("Number of features: %d" % tweet_tfidf.get_shape()[1])
X_train_tfidf, X_test_tfidf = train_test_split(tweet_tfidf, test_size=0.4, random_state=0)
X_train_tfidf_csr = X_train_tfidf.tocsr()

# Get number of tweets
n = X_train_tfidf_csr.shape[0]
tfidf_bytweet = [{} for _ in range(0,n)]

# Feature list
terms = vectorizer.get_feature_names()

# List feature words and tfidf scores by tweet
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bytweet[i][terms[j]] = X_train_tfidf_csr[i, j]

print("Original Sentence:", X_train[0])
print("TF-IDF Vector:", tfidf_bytweet[0])

Number of features: 1061
Original Sentence: thank you eonnie
TF-IDF Vector: {'thank': 1.0}


In [21]:
# Capture components with SVD
# Dimensionality Reduction
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# SVD data reducer, reducing to 130 features
svd = TruncatedSVD(130)
lsa = make_pipeline(svd, Normalizer(copy=False))

# Run SVD on the training data then project
X_train_lsa = lsa.fit_transform(X_train_tfidf)

exp_var = svd.explained_variance_ratio_
total_variance = exp_var.sum()
print("Variance captured by new components: {}".format(total_variance*100))

# See what paragraphs the model considers similar
paras_by_component = pd.DataFrame(X_train_lsa, index=X_train)
for i in range(5):
    print('Component {}'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])

Variance captured by new components: 60.31351699537344
Component 0
follow &amp; follow u back    0.99333
follow &amp; follow u back    0.99333
follow &amp; follow u back    0.99333
follow &amp; follow u back    0.99333
follow &amp; follow u back    0.99333
follow &amp; follow u back    0.99333
follow &amp; follow u back    0.99333
follow &amp; follow u back    0.99333
follow &amp; follow u back    0.99333
follow &amp; follow u back    0.99333
Name: 0, dtype: float64
Component 1
heyy i miss you                     0.996829
I miss his massages                 0.996829
miss you so much xxxxxx             0.996829
i miss them so much                 0.996829
miss you                            0.996829
Miss chillin'with you               0.996829
miss you                            0.996829
i miss you                          0.996829
Already miss so much                0.996829
French mixers miss you so much 💜    0.996349
Name: 1, dtype: float64
Component 2
thank you eonnie               