In [36]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
# from catboost import CatBoostClassifier
# from xgboost import XGBClassifier


In [2]:
# Read the labelled and unlabelled splits with compact integer dtypes.
# NOTE(review): int16 caps `id` at 32767 — confirm every id in both files fits.
df_train = pd.read_csv(
    '../data/train.csv',
    dtype=dict(id=np.int16, target=np.int8),
)
df_test = pd.read_csv('../data/test.csv', dtype=dict(id=np.int16))

In [3]:
# Peek at the first two rows to check which columns were loaded.
# NOTE(review): the `Unnamed: 0*` columns in the output look like index columns
# written back by earlier CSV round-trips — consider saving with index=False.
df_train.head(2)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,keyword,location,text,target,text_cleaned,tokens
0,0,0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this # earthquake...,our deed reason earthquake may allah forgive u
1,1,1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask . Canada,forest fire near la ronge sask canada


In [45]:
# Build the raw text documents and labels shared by the experiments below.
# Missing `tokens` (NaN) become empty documents: casting NaN straight to unicode
# would instead inject the literal word "nan" into the vocabulary.
X = df_train['tokens'].fillna('').values.astype('U')
# Keep the labels numeric (int8 as loaded) rather than casting them to strings,
# so predictions come back as 0/1 and work with metrics defaulting to pos_label=1.
y = df_train['target'].values
X_test = df_test['tokens'].fillna('').values.astype('U')

# Hold out 33% of the labelled rows for validation; the fixed seed keeps the
# split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=123)


In [25]:
# Sanity-check the size of the training split.
# NOTE(review): the saved output (7613,) does not look like a 67% split, and the
# execution counts are out of order — probably stale; re-run after the split cell.
X_train.shape

(7613,)

In [47]:
def text_vectorization(vectorizer, X_train, X_val):
    """Fit ``vectorizer`` on the training documents and encode both splits.

    The vectorizer is fitted on ``X_train`` only (through ``fit_transform``)
    and then applied as-is to ``X_val`` (through ``transform``).

    Args:
        vectorizer: object exposing ``fit_transform`` and ``transform``.
        X_train: training documents.
        X_val: validation documents.

    Returns:
        Tuple ``(train_features, val_features)`` as produced by the vectorizer.
    """
    train_features = vectorizer.fit_transform(X_train)
    val_features = vectorizer.transform(X_val)
    return train_features, val_features

In [38]:
def predict(classifier, X_train, y_train, X_val, y_val):
    """Fit ``classifier`` on the training split and return validation accuracy.

    Args:
        classifier: object exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: training features.
        y_train: training labels.
        X_val: validation features.
        y_val: validation labels (any array-like).

    Returns:
        Fraction of validation samples whose predicted label equals ``y_val``.

    Raises:
        ValueError: if the predictions and ``y_val`` do not have the same shape.
    """
    classifier.fit(X_train, y_train)
    # Coerce both sides to arrays: comparing two plain lists with ``==`` yields
    # a single bool, which would collapse the accuracy to 0.0 or 1.0.
    y_pred = np.asarray(classifier.predict(X_val))
    y_true = np.asarray(y_val)
    # Refuse mismatched shapes instead of letting them broadcast silently.
    if y_pred.shape != y_true.shape:
        raise ValueError(
            f"prediction shape {y_pred.shape} does not match label shape {y_true.shape}"
        )
    return np.mean(y_pred == y_true)

In [50]:
# Compare 2 text vectorizers (Bag of Words, TF-IDF) across 6 classifiers.
# HashingVectorizer is evaluated in its own cell.
# NOTE: the tree-based models are created without random_state, so their
# scores vary slightly from run to run.
vectorizers = [CountVectorizer(), TfidfVectorizer()]
models = [
    SVC(),
    LogisticRegression(),
    RandomForestClassifier(),
    ExtraTreesClassifier(),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
]

# The split depends only on X, y and the fixed seed, so do it once instead of
# once per vectorizer. The raw text keeps its own names so the vectorized
# matrices assigned to X_train / X_val below never feed back into a vectorizer.
X_train_text, X_val_text, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=123)

for vectorizer in vectorizers:
    # Fit the vectorizer on the training text only, then encode both splits.
    X_train, X_val = text_vectorization(vectorizer, X_train_text, X_val_text)

    for model in models:
        # Each call refits the model on the current vectorizer's features.
        acc = predict(model, X_train, y_train, X_val, y_val)
        print(vectorizer, model, acc)

CountVectorizer() SVC() 0.8062077198567449
CountVectorizer() LogisticRegression() 0.8058097890966972
CountVectorizer() RandomForestClassifier() 0.7859132510943095
CountVectorizer() ExtraTreesClassifier() 0.7684042976522085
CountVectorizer() DecisionTreeClassifier() 0.7230401910067649
CountVectorizer() KNeighborsClassifier() 0.6975726223637088
TfidfVectorizer() SVC() 0.8050139275766016
TfidfVectorizer() LogisticRegression() 0.8089932351770792
TfidfVectorizer() RandomForestClassifier() 0.7795463589335455
TfidfVectorizer() ExtraTreesClassifier() 0.7688022284122563
TfidfVectorizer() DecisionTreeClassifier() 0.7043374452845205
TfidfVectorizer() KNeighborsClassifier() 0.7397532829287704


In [53]:
# Evaluate the HashingVectorizer with SVC and LogisticRegression only.
# NOTE(review): the tree-based and KNN models are left out here — presumably
# because of their cost on the hashed feature space; confirm before re-adding.
vectorizers = [HashingVectorizer()]
models = [SVC(), LogisticRegression()]

# The split depends only on X, y and the fixed seed, so do it once up front.
# The raw text keeps its own names so the vectorized matrices assigned to
# X_train / X_val below never feed back into a vectorizer.
X_train_text, X_val_text, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=123)

for vectorizer in vectorizers:
    # Encode both splits with the same vectorizer instance.
    X_train, X_val = text_vectorization(vectorizer, X_train_text, X_val_text)

    for model in models:
        acc = predict(model, X_train, y_train, X_val, y_val)
        print(vectorizer, model, acc)

HashingVectorizer() SVC() 0.8014325507361719
HashingVectorizer() LogisticRegression() 0.8002387584560287
