In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [169]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [110]:
df = pd.read_csv('train.csv')

In [111]:
df.set_index('id',inplace=True)

In [112]:
df['length_of_tweet'] = df['text'].apply(lambda x: len(x))

# Filling in Keyword Nulls

In [114]:
#grab all #hashtags in a list
df.loc[df['keyword'].isna(),'hashtags'] = df['text'].apply(lambda x: [i[1:] for i in x.split() if '#' in i])

#create a set of all values in keyword column
keywords = set(df[~df.keyword.isna()].keyword.values)

In [115]:
#narrow down the list list of all hashtags in keywords
df['new_tag'] = df.hashtags.apply(lambda x: [i.lower() for i in x if i in keywords] if type(x) == list else np.nan)

#drop empty lists
df['new_tag'] = df['new_tag'].apply(lambda x: x if x != [] and x != np.nan else np.nan)

#grab the first hashtag and fill in missing values
df['new_tag'] = df['new_tag'].apply(lambda x: x[0] if type(x) == list else np.nan)
df.loc[df.keyword.isna(), 'keyword'] = df['new_tag']

In [116]:
#drop old columns
df.drop(['new_tag','hashtags'],inplace=True, axis=1)

# Removing Weird locations

In [118]:
#get rid of weird characters and blank spaces
characters= ['?','/', '#','+']
df['location'] = df.location.apply(lambda x: x if not any([char in str(x) for char in characters]) else np.nan)
df['location'] = df.location.apply(lambda x: x if type(x) != str else (x if x.strip() != '' else np.nan)) 

In [120]:
#get rid of locations with numbers
df['location'] = df.location.apply(lambda x: x if not any([i.isdigit() for i in str(x)]) else np.nan)

In [122]:
#get rid of locations with 4 or more words
df['location'] = df.location.apply(lambda x: x if type(x) != str else (x if len(x.split()) < 4 else np.nan))

In [126]:
bad_abbrev_and_words = ['Earth','Worldwide','Everywhere','Reddit','World','Global','ava','EIC','HTX', 'ATX','atx','PDX','MNL','CLT', 'NBO', 'AEP','mnl', 'ayr', 'GCC', 'Htx','wny', 'VCU', 'Orm', 'DMV','Ktx',]
df['location'] = df.location.apply(lambda x: x if x not in bad_abbrev_and_words else np.nan)


In [128]:
df.loc[df.groupby('location').location.transform('count') == 1,'location'] = np.nan

In [143]:
df.isna().sum()

keyword              57
location           5091
text                  0
target                0
length_of_tweet       0
dtype: int64

In [64]:
#map america and US to USA
#map nyc to NY On
#BC to britich columbia

# Filling in Location Nulls

In [130]:
#split into two differnt columns for upper case abbreviations and lower case for words
df['new_loc_lower'] = df.text.apply(lambda x: [i.lower().strip() for i in x.split() if len(i) > 2])
df['new_loc_upper'] = df.text.apply(lambda x: [i.upper().strip() for i in x.split() if len(i) <= 2])

In [131]:
#remove all blank strings from both
df['new_loc_lower'] = df.new_loc_lower.apply(lambda x: x if '' not in x else np.nan)
df['new_loc_upper'] = df.new_loc_upper.apply(lambda x: x if '' not in x else np.nan)

#remove all empty lists from both
df['new_loc_lower'] = df.new_loc_lower.apply(lambda x: x if x != [] else np.nan)
df['new_loc_upper'] = df.new_loc_upper.apply(lambda x: x if x != [] else np.nan)

In [132]:
#create a unique list of locations for both upper and lower
x = list(df.location.unique())[1:]
key_down = [i.lower() for i in x if len(i) > 2]
key_up = [i.upper() for i in x if len(i) <= 2]

In [133]:
#remove all words that arnt in either key_up or key_down
df['new_loc_lower'] = df.new_loc_lower.apply(lambda x: [i.lower() for i in x if i.lower() in key_down] if type(x) == list else np.nan)
df['new_loc_upper'] = df.new_loc_upper.apply(lambda x: [i.upper() for i in x if i.upper() in key_up] if type(x) == list else np.nan)


In [134]:
#remove common 2 letter words from initials
bad_initials = ['IN','ON', 'OK']
df['new_loc_upper'] = df.new_loc_upper.apply(lambda x: [i for i in x if i not in bad_initials] if type(x) == list else np.nan)

#remove empty lists and grab the first element in each list
df['new_loc_upper'] = df.new_loc_upper.apply(lambda x: x if x != [] else np.nan)
df['new_loc_upper'] = df.new_loc_upper.apply(lambda x: x[0] if type(x) == list else x)

In [135]:
#remove empty lists
df['new_loc_lower'] = df.new_loc_lower.apply(lambda x: x if x != [] else np.nan)

#if only element in list grab that element
df['new_loc_lower'] = df.new_loc_lower.apply(lambda x: x if type(x) != list else (x[0] if len(x) == 1 else x))

#if element repeats in list grab that element
df['new_loc_lower'] = df.new_loc_lower.apply(lambda x: x if type(x) != list else (x[0] if x[0] in x[1:] else x))

In [136]:
bad_places = ['west','east','north','south', 'mass', 'hell','heaven', 'global','world', 'unknown', 'earth', 'ebola', 'nowhere','studio', 'lincoln']

#remove all words in bad_places for lists
df['new_loc_lower'] = df.new_loc_lower.apply(lambda x: [i for i in x if i not in bad_places] if type(x) == list else x)

#remove all words in bad_places for strings
df['new_loc_lower'] = df.new_loc_lower.apply(lambda x: x if type(x) != str else(x if x not in bad_places else np.nan))

#set empty lists to NaN and grab the first element in each list
df['new_loc_lower'] = df.new_loc_lower.apply(lambda x: x if x != [] else np.nan)
df['new_loc_lower'] = df.new_loc_lower.apply(lambda x: x[0] if type(x) == list else x)

In [137]:
#set 2 digit location first since more common, then words
df.loc[df['location'].isna(),'location'] = df.loc[df['location'].isna(),'new_loc_upper']
df.loc[df['location'].isna(),'location'] = df.loc[df['location'].isna(),'new_loc_lower']

In [138]:
df.drop(['new_loc_lower','new_loc_upper'],inplace=True,axis=1)

In [139]:
df.loc[df.groupby('location').location.transform('count') == 1,:] = 'Missing'

In [140]:
#taking the second location in each location with two specified
df['location'] = df['location'].apply(lambda x: x if type(x) != str else x.split(', ')[1] if ', ' in x else x)

In [141]:
#removing .
df['location'] = df['location'].apply(lambda x: x if type(x) != str else  ''.join(x.split('.')) if '.' in x else x)

#capitalize 
df['location'] = df['location'].apply(lambda x: x.title() if type(x) == str else x)

In [142]:
df.loc[df.groupby('location').location.transform('count') < 11,'location'] = 'ten_or_less'

In [146]:
df.location.fillna('Missing', inplace=True)
df.keyword.fillna('Missing', inplace=True)

# Tokenizing Text

In [20]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
import string

In [21]:
stopwords_list = stopwords.words('english')
stopwords_list += list(string.punctuation)
stopwords_list += ["''", '""', '...', '``', 'http', 'https']

In [22]:
def process_tweet(tweet):
    tokenized_tweet = nltk.word_tokenize(tweet)
    clean_results = [w.lower() for w in tokenized_tweet if not w.lower() in stopwords_list and not 't.co/' in w.lower()]
    return clean_results

In [23]:
processed_data = list(map(process_tweet, df.text))
df['tokenized_data'] = processed_data

# Bag of words

In [145]:
X.loc[:,['keyword','location']] = X.loc[:,['keyword','location']].fillna('Missing')
X.loc[:,'length_of_tweet'] = X.loc[:,'length_of_tweet'].fillna(0)

x = pd.get_dummies(X['keyword'],drop_first=True)
x2 = pd.get_dummies(X['location'],drop_first=True)

X.drop(['location', 'keyword'],axis=1,inplace=True)
X = pd.concat([X,x,x2],axis=1)

In [25]:
X = df.drop(['target'], axis=1)
y = df['target']

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=.2, random_state=2)

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

In [31]:
tf_idf_data_train = vectorizer.fit_transform(x_train.text)

tf_idf_data_test = vectorizer.transform(x_test.text)

# RFC

In [33]:
from sklearn.ensemble import RandomForestClassifier

In [34]:
rfc = RandomForestClassifier()

In [35]:
rfc.fit(tf_idf_data_train,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [39]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [36]:
y_pred = rfc.predict(tf_idf_data_test)


In [40]:
accuracy_score(y_pred, y_test)

0.7413000656598818

In [152]:
f1_score(y_train_pred, y_train)

ValueError: Target is multilabel-indicator but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted', 'samples'].

In [106]:
x = d.isna().sum()
x

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [108]:
x = pd.DataFrame(x)
x.columns = ['Nulls']
x

Unnamed: 0,Nulls
id,0
keyword,61
location,2533
text,0
target,0
