In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import stopwords
import string
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

## Overview of Data

In [2]:
df = pd.read_csv('../Data/train.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


## Split into train and test groups

In [4]:
X = df[['keyword', 'location', 'text']]
y = df['target']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=18)

In [6]:
X_train

Unnamed: 0,keyword,location,text
355,army,,Vote for #Directioners vs #Queens in the 5th r...
1570,cliff%20fall,,#FunnyNews #Business Watch the moment a cliff ...
7198,weapon,,Back to back like I'm on the cover of lethal w...
2614,destruction,,Crackdown 3 Destruction Restricted to Multipla...
6977,twister,,It's alil twister at Tha end to! I was like oh...
...,...,...,...
1726,collided,"Johannesburg, South Africa",2 pple have been confirmed dead and over 20 re...
2885,drought,"Los Angeles, CA",'It's an eerie way of revealing both our histo...
1144,bombing,,Japan Marks 70th Anniversary of Hiroshima Atom...
4371,hijacker,,Medieval airplane hijacker testa: earnings the...


### Remove Stopwords & Tokenize Text

In [7]:
#Create List of stopwords & punctuation
stopwords_list = stopwords.words('english') + list(string.punctuation)
stopwords_list += ["''", '""', '...', '``']

In [8]:
def process_tweet(tweet_text):
    
    #Remove url links from text
    tweet_text = re.sub(r"http\S+", "", tweet_text)
    
    #Tokenize text using NLTK function
    tokens = nltk.word_tokenize(tweet_text)
    
    #Make all words lowercase and remove words in stopwords_list
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]
    
    return stopwords_removed        

In [9]:
X_train_processed = list(map(process_tweet, X_train['text']))

### EDA - Frequency Distribution

Find total unique words in the dataset.

In [10]:
#Use a set so that no duplicate words are counted
total_vocab = set()
for text in X_train_processed:
    total_vocab.update(text)
len(total_vocab)

15132

In [11]:
articles_concat = []
for article in X_train_processed:
    articles_concat += article

articles_freqdist = FreqDist(articles_concat)
articles_freqdist.most_common(50)

[("'s", 605),
 ("n't", 342),
 ('like', 263),
 ('amp', 259),
 ("'m", 184),
 ('fire', 181),
 ('get', 166),
 ('via', 163),
 ('new', 160),
 ('news', 153),
 ('people', 144),
 ('one', 140),
 ('video', 130),
 ('disaster', 119),
 ('2', 118),
 ('emergency', 115),
 ('would', 106),
 ('police', 103),
 ("'re", 101),
 ('still', 95),
 ('man', 93),
 ('body', 92),
 ('back', 91),
 ('..', 91),
 ('going', 91),
 ('crash', 91),
 ('got', 90),
 ('storm', 89),
 ('day', 88),
 ('us', 88),
 ('california', 84),
 ('burning', 84),
 ('know', 81),
 ('suicide', 79),
 ('time', 79),
 ('two', 78),
 ('today', 78),
 ('buildings', 78),
 ('ca', 78),
 ('youtube', 78),
 ('see', 77),
 ('love', 76),
 ('first', 76),
 ('world', 75),
 ('killed', 75),
 ('families', 75),
 ('fires', 74),
 ('rt', 74),
 ('nuclear', 74),
 ('attack', 74)]

### Vectorize with TF-IDF

The TfidfVectorizer() function takes in whole blocks of text, not individual words.  Therefore, we remove stopwords from the original text stored in our X_train dataframe.

In [12]:
def remove_links(tweet_text):
    #Remove url links from text
    tweet_text = re.sub(r"http\S+", "", tweet_text)
    
    return tweet_text

In [13]:
def remove_stopwords(text):
    #Create List of stopwords & punctuation
    stopwords_list = stopwords.words('english') + list(string.punctuation)
    stopwords_list += ["''", '""', '...', '``']
    
    for word in stopwords_list:
        text = text.replace(word, '')
#     stopwords_removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]
    return text

In [14]:
vectorizer = TfidfVectorizer(strip_accents='unicode', lowercase=True)

tf_idf_data_train = vectorizer.fit_transform(data)

tf_idf_data_test = vectorizer.transform(newsgroups_test.data)

NameError: name 'data' is not defined