In [1]:
import pandas as pd

In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [3]:
df_test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [4]:
df_train = df_train[['text', 'target']]
df_test = df_test[['text']]
df_test

Unnamed: 0,text
0,Just happened a terrible car crash
1,"Heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,Apocalypse lighting. #Spokane #wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan
...,...
3258,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,Storm in RI worse than last hurricane. My city...
3260,Green Line derailment in Chicago http://t.co/U...
3261,MEG issues Hazardous Weather Outlook (HWO) htt...


# Text preprocessing

In [5]:
df_train.isnull().sum()

text      0
target    0
dtype: int64

In [6]:
import nltk 
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

def preprocess_test(text):
    text = text.lower()

    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    #implement stemming
    return ' '.join(words)


In [7]:
df_train['text'].apply(preprocess_test)
df_test['text'].apply(preprocess_test)

0                             happened terrible car crash
1       heard earthquake different cities stay safe ev...
2       forest fire spot pond geese fleeing across str...
3                   apocalypse lighting spokane wildfires
4                     typhoon soudelor kills china taiwan
                              ...                        
3258    earthquake safety los angeles safety fasteners...
3259    storm ri worse last hurricane city amp others ...
3260     green line derailment chicago http co utbxlcbiuy
3261    meg issues hazardous weather outlook hwo http ...
3262    cityofcalgary activated municipal emergency pl...
Name: text, Length: 3263, dtype: object

In [8]:
from sklearn.model_selection import train_test_split

test = df_test
train = df_train

X_train, X_valid, y_train, y_valid = train_test_split(train['text'], train['target'], test_size=0.2, random_state=42, stratify=train['target'])

# TF-IDF

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer( max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english') 

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_valid)
X_tfidf = tfidf_vectorizer.transform(train['text'])



In [23]:
X_valid_tfidf

<1523x108829 sparse matrix of type '<class 'numpy.float64'>'
	with 21768 stored elements in Compressed Sparse Row format>

In [18]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train_tfidf, y_train)

In [24]:
rf.score(X_valid_tfidf, y_valid)

0.7774130006565988

In [29]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rf, X_tfidf, train["target"], cv=5, n_jobs=-1)
scores.mean()

0.652969750725408

In [27]:
sample_submission = pd.read_csv("sample_submission.csv")