In [1]:
#import packages
import pandas as pd
from nltk.tokenize import TweetTokenizer

## Data Cleaning (preliminary)

In [2]:
tweets = pd.read_pickle("../data/train.pkl")

In [3]:
#isolating relevant columns
tweets.drop(columns=["tweet_id", "text_info_conf", "text_human_conf", "image_id", "image_info", "image_info_conf", "image_human", "image_human_conf", "image_damage", "image_damage_conf", "image_url", "image_path"], inplace=True)

In [4]:
#making text_info binary

#map new values
tweets["info_binary"] = tweets.text_info.map({"informative":1,"not_informative":0, "dont_know_or_cant_judge":0})

In [6]:
tweets.info_binary.value_counts() #baseline accuracy

1    9742
0    3788
Name: info_binary, dtype: int64

In [36]:
#make tweets lowercase
lower_tweets = []
for tweet in tweets.tweet_text:
    lower_tweets.append(tweet.lower())

lower_tweets = pd.DataFrame(lower_tweets)

lower_tweets.rename(columns={0:"tweet"}, inplace=True)

tweets.tweet_text = lower_tweets.tweet

#truncate elongations (NO CODE FOR NOW)
#>>> import re
#>>> re.sub(r'(.)\1+', r'\1\1', "haaaaapppppyyy")  #https://stackoverflow.com/questions/10072744/remove-repeating-characters-from-words
#spell out numbers (NO CODE FOR NOW)

#replace all usernames with 'USERID'
tweets.tweet_text = tweets.tweet_text.replace(r'@\S+', 'USERID', regex=True)

userid_tweets = []
for tweet in tweets.tweet_text:
    userid_tweets.append(tweet.replace('USERID', ''))

userid_tweets = pd.DataFrame(lower_tweets)

userid_tweets.rename(columns={0:"tweet"}, inplace=True)
tweets.tweet_text = lower_tweets.tweet


#replace all URLs with HTTP
tweets.tweet_text = tweets.tweet_text.replace(r'http\S+', 'HTTP', regex=True).replace(r'www\S+', 'HTTP', regex=True)

http_tweets = []

for tweet in tweets.tweet_text:
    http_tweets.append(tweet.replace('HTTP', ''))

http_tweets = pd.DataFrame(http_tweets)

http_tweets.rename(columns={0:"tweet"}, inplace=True)
tweets.tweet_text = http_tweets.tweet

#remove "RT" string
tweets.tweet_text = tweets.tweet_text.map(lambda x: x.lstrip('rt'))

In [41]:
tweets.shape

(13526, 5)

In [42]:
tweets = tweets.drop_duplicates(subset=["tweet_text"])

In [43]:
tweets.to_pickle("../tweets_cleaned.pkl")

In [45]:
tweets.info_binary.value_counts(1)

1    0.75675
0    0.24325
Name: info_binary, dtype: float64

## Preprocessing (preliminary)

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [47]:
from sklearn.feature_extraction.text import CountVectorizer
#Name variables 

X = tweets[["tweet_text"]]
y = tweets["info_binary"]

#train test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=y)

## Preliminary Model: CountVectorizer and Logistic Regression 

In [49]:
#CountVectorizer 
tknzr = TweetTokenizer()
cvec = CountVectorizer(stop_words="english", tokenizer=tknzr.tokenize)
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train['tweet_text']).todense(), columns = cvec.get_feature_names())
X_test_cvec = pd.DataFrame(cvec.transform(X_test['tweet_text']).todense(), columns = cvec.get_feature_names())
lr = LogisticRegression()
lr_model = lr.fit(X_train_cvec, y_train)
predictions = lr.predict(X_test_cvec)

In [53]:
cross_val_score(lr, X_train_cvec, y_train, cv=5).mean()

0.8109999999999999

In [51]:
lr.score(X_train_cvec, y_train)

0.9826666666666667

In [52]:
lr.score(X_test_cvec, y_test)

0.816