# Little experiment in sentiment classification
 
 * Pick a large number of tweets that contain `:)` or `:(`
 * Train a classifier to try to distinguish among them
 * Pick the classifier's brain to see which are the distinguishing features (words associated with 'happy' and 'unhappy' tweets)
 
# Step 1: Data preparation
 
 * Let us grab some tweets
 * Filter to have either `:)` or `:(` (or any other emoji labeled by me as being happy or unhappy), and sort these tweets to separate datasets
 * Before training a classifier, we must make sure to remove the emojis themselves from the tweets!!!
     * **Why?**

In [1]:
import gzip
import json
from json.decoder import JSONDecodeError

In [3]:
counter = 0
tweets = []
with gzip.open("english_tweets_00003.json.gz", "rt") as f:
    try:
        for line in f:
            line = line.strip() # remove leading and trailing whitespoace characters (e.g. "\n")
            if not line: # empty line
                continue
            try:
                tweet = json.loads(line) # decode line as python data structure (here: dictionary)
                tweets.append(tweet)
                
                counter += 1
                if counter >= 5000000: # maximum number of tweets we are willing to read
                    break
                if counter%500000==0: # print status every 500,000 tweet
                    print(counter)
                    
            except JSONDecodeError: # broken json, data writer probably died in a middle of the writing process
                pass
    except: # broken gzip, maybe the machine was shut down or something
        pass

print("Number of tweets:", len(tweets))
print(tweets[0])


500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
4500000
Number of tweets: 5000000
{'id': 934034922288943104, 'truncated': False, 'extended_entities': {'media': [{'id': 933775395614109696, 'source_status_id_str': '933775963921330176', 'media_url_https': 'https://pbs.twimg.com/ext_tw_video_thumb/933775395614109696/pu/img/FXt2DTTIybRVrxnO.jpg', 'source_user_id': 61559439, 'expanded_url': 'https://twitter.com/nvidia/status/933775963921330176/video/1', 'media_url': 'http://pbs.twimg.com/ext_tw_video_thumb/933775395614109696/pu/img/FXt2DTTIybRVrxnO.jpg', 'source_status_id': 933775963921330176, 'type': 'video', 'id_str': '933775395614109696', 'sizes': {'large': {'w': 1440, 'h': 720, 'resize': 'fit'}, 'thumb': {'w': 150, 'h': 150, 'resize': 'crop'}, 'small': {'w': 680, 'h': 340, 'resize': 'fit'}, 'medium': {'w': 1200, 'h': 600, 'resize': 'fit'}}, 'indices': [32, 55], 'source_user_id_str': '61559439', 'video_info': {'aspect_ratio': [2, 1], 'duration_millis': 20087, 'variants': [{

In [5]:
import re # regular expressions

happy_emojis = [" :)", "\U0001F600", "\U0001F603", "\U0001F60D"]

unhappy_emojis= [" :(", "\U0001F641", "\U0001F61F", "\u2639", "\U0001F625", \
                 "\U0001F622", "\U0001F620", "\u2620", "\U0001F621"]

print("happy:", happy_emojis)
print("unhappy:", unhappy_emojis)

# compile lists into regular expressions for fast search
happy_regex = re.compile("|".join(re.escape(e) for e in happy_emojis))
unhappy_regex = re.compile("|".join(re.escape(e) for e in unhappy_emojis))

happy: [' :)', '😀', '😃', '😍']
unhappy: [' :(', '🙁', '😟', '☹', '😥', '😢', '😠', '☠', '😡']


In [9]:
happy_tweets = []
unhappy_tweets = []

for tweet in tweets:
    
    if "retweeted_status" in tweet:
        tweet = tweet["retweeted_status"] # get the original text rather than one with RT: marker
    
    if tweet.get("truncated"): # text field may be truncated, get the full text
        text = tweet["extended_tweet"]["full_text"]
    else:
        text = tweet["text"] # not truncated
        
    # search for happy and unhappy emojis
    if re.search(happy_regex, text) and re.search(unhappy_regex, text): # contains both, skip
        continue
    elif re.search(happy_regex, text): # happy
        happy_tweets.append(text)
    elif re.search(unhappy_regex, text): # unhappy
        unhappy_tweets.append(text)
    
print("Happy:", len(happy_tweets))
print(happy_tweets[:5])

print("Unhappy:", len(unhappy_tweets))
print(unhappy_tweets[:5])

del tweets # free memory, we do not need all 5M tweets anymore

Happy: 69337
['RED VELVET SEASON GREETING PREVIEW 😍 https://t.co/4K6n1CnUik', '@fukayaqui @DonnaFins @redsand2 @SewingAngela @rosevine3 @kigi_ebooks @awlasky @mank56 Thank you, Aki! Happy Friday everyone! 😀', 'Why so hot, Nicomaine? 😍\n\n© mainedcm | IGS\n#ADNTimeless2017 https://t.co/9MH5mIgJZl', '@PawanKalyan #Kalyan babu launched d T of #2Countries at #PSPK25 Sets 😍\nFight sequence na endi samee shoot,pillodiki chematalu padday 🙏🙏\n@mee_sunil https://t.co/wTXIC7Kyh3', 'workin on me :) https://t.co/kjdytM1WSt']
Unhappy: 21034
["Likey is ranked 3rd on Music Bank this week!\n\ni can't decide between 😢 or 🎉 tbh https://t.co/HU8AcoGB8j", 'I just checked the headphone I bought in traffic, somewhere around Oshodi last night . They wrote Beats by Dele Ladipo. But why?😢', 'but we don’t get to talk all that much so :(( https://t.co/9OZAvTwToR', 'Need to stop looking at the sales :(', 'GAMEDAY! The 2017 season comes to a close today when we host @American_VBall rival Cincinnati at 1 p.m. Help 

# Step 2: Preprocessing
 
 * Get rid of the emojis
 * Get rid of line breaks (in case a tweet has multiple lines)

In [37]:
processed_happy = []
for tweet in happy_tweets: # use the happy emoji regular expression to remove all occurances of those emojis
    text = re.sub(happy_regex, " ", tweet)
    processed_happy.append(" ".join(text.split())) # normalize spaces

processed_unhappy = []
for tweet in unhappy_tweets:
    text = re.sub(unhappy_regex, " ", tweet)
    processed_unhappy.append(" ".join(text.split()))
    
processed_happy = processed_happy[:min(len(processed_happy)-1,len(processed_unhappy))] # balance datasets
    
print("Happy:", len(processed_happy))
print(processed_happy[:5])

print("Unhappy:", len(processed_unhappy))
print(processed_unhappy[:5])

Happy: 21034
['RED VELVET SEASON GREETING PREVIEW https://t.co/4K6n1CnUik', '@fukayaqui @DonnaFins @redsand2 @SewingAngela @rosevine3 @kigi_ebooks @awlasky @mank56 Thank you, Aki! Happy Friday everyone!', 'Why so hot, Nicomaine? © mainedcm | IGS #ADNTimeless2017 https://t.co/9MH5mIgJZl', '@PawanKalyan #Kalyan babu launched d T of #2Countries at #PSPK25 Sets Fight sequence na endi samee shoot,pillodiki chematalu padday 🙏🙏 @mee_sunil https://t.co/wTXIC7Kyh3', 'workin on me https://t.co/kjdytM1WSt']
Unhappy: 21034
["Likey is ranked 3rd on Music Bank this week! i can't decide between or 🎉 tbh https://t.co/HU8AcoGB8j", 'I just checked the headphone I bought in traffic, somewhere around Oshodi last night . They wrote Beats by Dele Ladipo. But why?', 'but we don’t get to talk all that much so ( https://t.co/9OZAvTwToR', 'Need to stop looking at the sales', 'GAMEDAY! The 2017 season comes to a close today when we host @American_VBall rival Cincinnati at 1 p.m. Help us send out our seniors in s

# Step 3: Train a classifier

In [38]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

def tokenizer(text):
    return [word for word in re.split('([^a-zA-ZäåöÄÅÖ\:\)-])', text) if word != "" and word != " "]

# Turn the text into vectors that can be handled by the classifier
vectorizer = TfidfVectorizer(lowercase=True, binary=True, stop_words=None, tokenizer=tokenizer, min_df=20)

analyser = vectorizer.build_analyzer() # example of the vectorizer output in text format
print(analyser('Tässä twiittasin ja joku mummo tuli kysyy multa löytyykö niitä pokemoneja. :) 😃😃'))

# turn text into feature vectors (numbers)
data_matrix = vectorizer.fit_transform(processed_happy + processed_unhappy) # run vectorizer

# Give each document the target label to be predicted (1 or -1)
labels = [1]*len(processed_happy) + [-1]*len(processed_unhappy)

# Split data into train and test, keep random 10% of the data aside for testing
data_train, data_test, labels_train, labels_test = train_test_split(data_matrix, labels, test_size=0.1, random_state=0)

# Train the classifier on our data
# C is an important parameter: the smaller, the fewer features will be used
classifier=LinearSVC(C=0.05 ,dual=False, penalty='l1', max_iter=100000)
classifier.fit(data_train, labels_train)

# And test its accuracy on test data
print("Accuracy=", classifier.score(data_test, labels_test)*100.0)
print("Majority baseline=",max(len(processed_happy), \
                               len(processed_unhappy))/(len(processed_happy) + len(processed_unhappy))*100)



['tässä', 'twiittasin', 'ja', 'joku', 'mummo', 'tuli', 'kysyy', 'multa', 'löytyykö', 'niitä', 'pokemoneja', '.', ':)', '😃', '😃']
Accuracy= 80.79391490373187
Majority baseline= 50.0


# Step 4: Pick the classifier's brain
 
 * List features with extremely high (associated with positive labels) or extremely low (associated with negative labels) weights

In [39]:
# Print sorted by weight
f_names = vectorizer.get_feature_names() # feature names i.e. words 
sorted_by_weight = sorted(zip(classifier.coef_[0], f_names))

# Fourty lowest
print("Unhappy features:")
for f_weight,f_name in sorted_by_weight[:40]:
    print(f_name, f_weight)
print("------------------------")
# Fourty highest
print("Happy features:")
for f_weight,f_name in sorted(sorted_by_weight[-40:],reverse=True):
    print(f_name, f_weight)

Unhappy features:
️ -3.213200812327916
💔 -2.7936901473929105
😔 -2.52605848507915
ツ -2.421889597156168
sad -2.2633317611939905
( -2.1081233179747865
🏆 -1.8224674494627895
cry -1.7055506360336716
miss -1.645874919653571
rest -1.4808243871634572
trump -1.4134451500518703
not -1.3340863069700946
poor -1.3304624810619265
missing -1.296803639209172
crying -1.2233118530402833
ak- -1.2228342492992075
wtf -1.1891008991962468
😤 -1.1768903694456319
heart -1.1189842862684827
broke -1.1025202732437267
lost -1.0931192369705716
rip -1.0773443575745214
sorry -1.0677655506865638
was -1.0635753974082085
pls -1.0497851221321184
but -1.0228162639971374
👎 -1.0122700587900884
he -1.0085128526804634
xwjhh -0.9932680750795008
been -0.9832236146630627
i -0.9557923884838099
hate -0.9170271016052908
tears -0.9029342551578683
why -0.8910304538107255
no -0.8599303259820026
libya -0.8292131128617948
please -0.8214080684553509
help -0.8205372553209255
what -0.8083474355563263
still -0.7965884754441235
--------------