In [1]:
import pandas as pd
import numpy as np

import re
import string
from langdetect import detect_langs
from langdetect import DetectorFactory
DetectorFactory.seed = 12

import emoji
import nltk
from nltk.tokenize import TweetTokenizer 
#from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [2]:
# Load the raw tweets from a CSV addressed by a relative path.
df = pd.read_csv('../data/tweets.csv')

In [3]:
# Show column dtypes and non-null counts to spot sparsely populated columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
tweet_id                        14640 non-null int64
airline_sentiment               14640 non-null object
airline_sentiment_confidence    14640 non-null float64
negativereason                  9178 non-null object
negativereason_confidence       10522 non-null float64
airline                         14640 non-null object
airline_sentiment_gold          40 non-null object
name                            14640 non-null object
negativereason_gold             32 non-null object
retweet_count                   14640 non-null int64
text                            14640 non-null object
tweet_coord                     1019 non-null object
tweet_created                   14640 non-null object
tweet_location                  9907 non-null object
user_timezone                   9820 non-null object
dtypes: float64(2), int64(2), object(11)
memory usage: 1.7+ MB


In [4]:
# Remove four columns that are not used in the rest of this notebook.
# Rebinding (rather than inplace=True) keeps the statement chain-friendly.
df = df.drop(columns=[
    'airline_sentiment_gold',
    'negativereason_gold',
    'tweet_coord',
    'tweet_location',
])

In [5]:
# Preview the first rows after dropping the unused columns.
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,name,retweet_count,text,tweet_created,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,cairdin,0,@VirginAmerica What @dhepburn said.,2015-02-24 11:35:52 -0800,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,jnardino,0,@VirginAmerica plus you've added commercials t...,2015-02-24 11:15:59 -0800,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,yvonnalynn,0,@VirginAmerica I didn't today... Must mean I n...,2015-02-24 11:15:48 -0800,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,jnardino,0,@VirginAmerica it's really aggressive to blast...,2015-02-24 11:15:36 -0800,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,jnardino,0,@VirginAmerica and it's a really big bad thing...,2015-02-24 11:14:45 -0800,Pacific Time (US & Canada)


In [9]:
def extract_mentions(tweet):
    """Return the user handles mentioned in a tweet, without the leading ``@``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Handles in order of appearance (duplicates kept); empty when the
        tweet mentions nobody.
    """
    # Handles may contain underscores in addition to letters and digits;
    # leaving "_" out of the class would truncate "@jet_blue" to "jet".
    return re.findall(r"@([A-Za-z0-9_]{1,15})", tweet)


def extract_hashtags(tweet):
    """Return the distinct hashtags used in a tweet, without the leading ``#``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    set of str
        Hashtag bodies; empty when the tweet has no hashtags.
    """
    # A regex (instead of whitespace splitting) keeps trailing punctuation out
    # of the tag ("#fail!" -> "fail"), ignores a bare "#", and finds tags that
    # directly follow punctuation such as "(#tag)". The look-behind still
    # rejects a "#" embedded in a word, e.g. "a#b".
    return set(re.findall(r"(?<!\w)#(\w+)", tweet))


def preprocess(tweet):
    """Normalise a raw tweet into a list of stemmed, stop-word-free tokens.

    Steps, in order: strip @mentions, lower-case, strip URLs and digit runs,
    replace emoji with their textual names, remove punctuation from every
    word that is not an emoticon, tokenize, drop English stop words and
    apply Porter stemming.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Stemmed tokens; empty when nothing survives the cleaning.
    """
    # Underscore is part of the handle character class so that "@jet_blue"
    # is removed entirely instead of leaving "_blue" behind.
    no_mentions = re.sub(r"@[A-Za-z0-9_]{1,15}", '', tweet)
    lower = no_mentions.lower()
    no_urls = re.sub(r'http\S+', '', lower)
    no_num = re.sub(r'\d+', '', no_urls)

    # Space delimiters turn each emoji into a stand-alone word without
    # touching the colons that text emoticons such as ":)" depend on.
    emoji_to_word = emoji.demojize(no_num, delimiters=(' ', ' '))
    # Split on the ellipsis character *before* punctuation is stripped, so
    # "wait…what" yields two words rather than the fused "waitwhat".
    emoji_to_word = emoji_to_word.replace('…', ' ')

    emoticon_pattern = re.compile('[:;]{1}-?[dDpPsS)(]+')
    all_words = []
    # Iterate over the demojized text so emoji names actually reach the output.
    for word in emoji_to_word.split():
        if emoticon_pattern.fullmatch(word):
            # Emoticons are kept verbatim; stripping punctuation would erase them.
            all_words.append(word)
            continue
        word = re.sub('[^A-Za-z0-9]+', '', word)
        if word:  # skip words that consisted solely of punctuation
            all_words.append(word)
    no_punc = ' '.join(all_words)

    tokens = TweetTokenizer().tokenize(no_punc)
    no_stop = [token for token in tokens if token not in ENGLISH_STOP_WORDS]
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in no_stop]


# Run the cleaning pipeline on every tweet; each cell of the new column holds
# the list that preprocess returns for that row.
df['cleaned_tokens'] = df['text'].apply(preprocess)

In [10]:
# Keep only the first row for each distinct tweet text (removes repeats such
# as retweets).
df = df.drop_duplicates(subset=['text'], keep='first')

In [11]:
def find_lang(tweet):
    """Return the ``lang`` of the first candidate ``detect_langs`` gives for ``tweet``."""
    first_candidate = detect_langs(tweet)[0]
    return first_candidate.lang


# Label every tweet with its detected language code (detection runs on the raw text).
df['language'] = df['text'].apply(find_lang)

In [12]:
# Five most frequent detected languages.
df['language'].value_counts().head(5)

en    14228
fr       69
da       33
af       29
cy       11
Name: language, dtype: int64

In [13]:
# Token lists of the tweets that were not labelled as English.
non_en = df.loc[df['language'] != 'en', 'cleaned_tokens']

# Flatten the per-tweet lists into one list of tokens.
non_en_tokens = [token for tokens in non_en for token in tokens]

In [14]:
# The 20 most frequent tokens among tweets not labelled as English.
nltk.FreqDist(non_en_tokens).most_common(20)

[('fleet', 50),
 ('fleek', 50),
 ('just', 11),
 ('like', 8),
 ('rt', 8),
 ('guy', 7),
 ('did', 6),
 ('dont', 6),
 ('love', 6),
 ('jetblu', 5),
 ('broken', 4),
 ('luggag', 4),
 ('bag', 4),
 ('dmd', 4),
 ('your', 4),
 ('im', 4),
 ('u', 4),
 ('doesnt', 4),
 ('sent', 4),
 ('look', 3)]

The most common tokens in the "non-English" tweets are all English words, so it is safe to say that most, if not all, of our data set is in English.

In [15]:
def _count_substrings(text, needles):
    """Sum the non-overlapping occurrences of each needle in ``text``."""
    return sum(text.count(needle) for needle in needles)


# Re-join the cleaned tokens into one space-separated string per tweet.
df['cleaned_tweet'] = df['cleaned_tokens'].map(' '.join)

# Surface features computed from the raw, unprocessed tweet text.
raw_text = df['text']
df['length'] = raw_text.str.len()
df['capitals'] = raw_text.map(lambda text: sum(map(str.isupper, text)))
df['cap_length_ratio'] = df['capitals'] / df['length']
df['n_words'] = raw_text.str.split().str.len()
df['n_happy'] = raw_text.map(
    lambda text: _count_substrings(text, (':-)', ':)', ';-)', ';)', ':-D', ':D'))
)
df['n_sad'] = raw_text.map(
    lambda text: _count_substrings(text, (':-<', ':<', ':-(', ':(', ';-(', ';('))
)
df['n_exclamations'] = raw_text.str.count('!')
# str.count takes a regex, so the question mark must be escaped.
df['n_questions'] = raw_text.str.count(r'\?')

In [16]:
# Preview the first rows with the new token and feature columns.
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,name,retweet_count,text,tweet_created,...,cleaned_tokens,cleaned_tweet,length,capitals,cap_length_ratio,n_words,n_happy,n_sad,n_exclamations,n_questions
0,570306133677760513,neutral,1.0,,,Virgin America,cairdin,0,@VirginAmerica What @dhepburn said.,2015-02-24 11:35:52 -0800,...,[said],said,35,3,0.085714,4,0,0,0,0
1,570301130888122368,positive,0.3486,,0.0,Virgin America,jnardino,0,@VirginAmerica plus you've added commercials t...,2015-02-24 11:15:59 -0800,...,"[plu, youv, ad, commerci, experi, tacki]",plu youv ad commerci experi tacki,72,2,0.027778,9,0,0,0,0
2,570301083672813571,neutral,0.6837,,,Virgin America,yvonnalynn,0,@VirginAmerica I didn't today... Must mean I n...,2015-02-24 11:15:48 -0800,...,"[didnt, today, mean, need, trip]",didnt today mean need trip,71,5,0.070423,12,0,0,1,0
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,jnardino,0,@VirginAmerica it's really aggressive to blast...,2015-02-24 11:15:36 -0800,...,"[realli, aggress, blast, obnoxi, entertain, gu...",realli aggress blast obnoxi entertain guest fa...,126,2,0.015873,17,0,0,0,0
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,jnardino,0,@VirginAmerica and it's a really big bad thing...,2015-02-24 11:14:45 -0800,...,"[realli, big, bad, thing]",realli big bad thing,55,2,0.036364,10,0,0,0,0


In [17]:
# Persist the cleaned frame; index=False leaves the row index out of the file.
# NOTE(review): cleaned_tokens holds Python lists, which a CSV can only store
# as text — presumably readers should rely on cleaned_tweet instead; confirm.
df.to_csv('../data/cleaned_tweets.csv', index=False)