# Script for using the dataset
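Before running this notebook, set up a Kaggle account, download your API token (`kaggle.json`) from your profile page, and place it in the notebook's working directory; the first cell copies it to `~/.kaggle/`.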

In [None]:
! mkdir -p ~/.kaggle
! cp kaggle.json ~/.kaggle/
! ls ~/.kaggle
! chmod 600 /root/.kaggle/kaggle.json
! pip install kaggle
! kaggle competitions download -c nlp-getting-started
! ls

kaggle.json
Downloading test.csv to /content
  0% 0.00/411k [00:00<?, ?B/s]
100% 411k/411k [00:00<00:00, 61.8MB/s]
Downloading sample_submission.csv to /content
  0% 0.00/22.2k [00:00<?, ?B/s]
100% 22.2k/22.2k [00:00<00:00, 22.6MB/s]
Downloading train.csv to /content
  0% 0.00/965k [00:00<?, ?B/s]
100% 965k/965k [00:00<00:00, 60.0MB/s]
drive  kaggle.json  sample_data  sample_submission.csv	test.csv  train.csv


# The Code

## Initialization

In [None]:
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.svm import SVC

In [None]:
# Read the competition training and test CSVs from the working directory.
df, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# For each frame: schema summary, a 50-dash rule, then the first rows.
# .info() writes its report itself and returns None, so the report appears
# first and print() then emits "None", the rule and the preview on their own lines.
print(df.info(), '-'*50, df.head(), sep='\n')
print(test.info(), '-'*50, test.head(), sep='\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
None
--------------------------------------------------
   id keyword  ...                                               text target
0   1     NaN  ...  Our Deeds are the Reason of this #earthquake M...      1
1   4     NaN  ...             Forest fire near La Ronge Sask. Canada      1
2   5     NaN  ...  All residents asked to 'shelter in place' are ...      1
3   6     NaN  ...  13,000 people receive #wildfires evacuation or...      1
4   7     NaN  ...  Just got sent this photo from Ruby #Alaska as ...      1

[5 rows x 5 columns]
<class 'pandas.core.frame.DataFrame'

In [None]:
# Keep only the columns used downstream. .copy() makes each result an
# independent frame, so adding the clean_text column to it later does not
# raise pandas' SettingWithCopyWarning.
df = df[['id', 'text', 'target']].copy()
test = test[['id', 'text']].copy()

## Text Preprocessing
### TextBlob, NLTK, and Regex are all you need.

In [None]:
# # In case of import errors
# ! pip install nltk
# ! pip install textblob

import re
from textblob import TextBlob
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch only the corpora this cell relies on. A bare nltk.download() opens the
# interactive downloader and blocks "Run All" until someone answers its prompt.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# English stop words that text_preproc removes from every tweet.
stop_words = stopwords.words("english")
# Lemmatizer instance, kept available for the optional lemmatization step.
wordnet = WordNetLemmatizer()
def text_preproc(x, stop_word_list=None):
  """Normalise one tweet into lowercase, ASCII-only text for vectorisation.

  Steps, in order: lowercase; drop stop words; drop non-ASCII characters;
  blank out every run of text from "http" up to the next whitespace, every
  @mention and every #hashtag; delete apostrophe suffixes ('s, 're, ...);
  replace punctuation with spaces; delete tokens containing a digit; collapse
  whitespace runs and trim both ends.

  Args:
    x: Raw tweet text (str).
    stop_word_list: Optional iterable of lowercase words to remove. When
      omitted, the notebook-level `stop_words` is used.

  Returns:
    The cleaned string; it may be empty.
  """
  if stop_word_list is None:
    stop_word_list = stop_words
  # Set membership is constant-time; the raw list is scanned once per word.
  blocked = frozenset(stop_word_list)

  x = x.lower()
  # x = ' '.join(wordnet.lemmatize(word, 'v') for word in x.split())
  # Split on any whitespace (not just ' ') so stop words next to newlines or
  # tabs are removed too. This runs before punctuation is stripped so that
  # stop-list entries containing apostrophes can still match.
  x = ' '.join(word for word in x.split() if word not in blocked)
  # Drop emoji and any other non-ASCII characters.
  x = x.encode('ascii', 'ignore').decode()
  x = re.sub(r'https*\S+', ' ', x)
  x = re.sub(r'@\S+', ' ', x)
  x = re.sub(r'#\S+', ' ', x)
  x = re.sub(r'\'\w+', '', x)
  x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
  x = re.sub(r'\w*\d+\w*', '', x)
  x = re.sub(r'\s{2,}', ' ', x)
  # The substitutions above can leave a space at either end.
  return x.strip()

# Add the cleaned-text column to both frames, then preview each frame.
df['clean_text'] = df['text'].apply(text_preproc)
test['clean_text'] = test['text'].apply(text_preproc)
print(df.head(), test.head(), sep='\n')

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q
   id  ...                                         clean_text
0   1  ...                  deeds reason may allah forgive us
1   4  ...              forest fire near la ronge sask canada
2   5  ...  residents ask shelter place notify officer eva...
3   6  ...         people receive evacuation order california
4   7  ...              get send photo ruby smoke pour school

[5 rows x 4 columns]
   id  ...                                         clean_text
0   0  ...                          happen terrible car crash
1   2  ...           hear different cities stay safe everyone
2   3  ...  forest fire spot pond geese flee across street...
3   9  ...                                   apocalypse light
4  11  ...                 typhoon soudelor

## Text Classification using Traditional Machine Learning

### Document-Term Matrix with TF-IDF weighting 

In [None]:
# Fit the vocabulary and IDF weights on the training tweets only, then
# densify so the matrix can be wrapped in a DataFrame.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_text']).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
  feature_names = vectorizer.get_feature_names_out()
else:
  feature_names = vectorizer.get_feature_names()

df_new = pd.DataFrame(X, columns=feature_names)
print(df_new.shape)

# The test tweets are only transformed, so they share the training columns.
X_test = vectorizer.transform(test['clean_text']).toarray()
test_new = pd.DataFrame(X_test, columns=feature_names)
print(test_new.shape)

(7613, 11410)
(3263, 11410)


### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fix the seed so cross-validation scores and the submitted predictions are
# the same on every run; all other hyperparameters stay at their defaults.
clf = DecisionTreeClassifier(random_state=0)
# Trailing expression: the notebook shows the parameter dict as the cell output.
clf.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

In [None]:
# 10-fold cross-validation of `clf` on the TF-IDF features.
X = df_new.values
y = df.target.values
kfold = KFold(n_splits=10)

# Per-fold F1 scores, collected so the run can be summarised after the loop.
metrics = []
for train_idx, test_idx in kfold.split(X):
  X_train, X_test = X[train_idx], X[test_idx]
  y_train, y_test = y[train_idx], y[test_idx]
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  fold_f1 = f1_score(y_test, y_pred)
  metrics.append(fold_f1)
  print(fold_f1)

print('mean F1: %.4f (+/- %.4f)' % (np.mean(metrics), np.std(metrics)))

0.543956043956044
0.5795454545454545
0.5394190871369294
0.6066838046272494
0.653113087674714


KeyboardInterrupt: ignored

In [None]:
clf.fit(X, y)
X_test = test_new.values
y_pred = clf.predict(X_test)

# Preparing submission
submission = pd.DataFrame()
submission['id'] = test['id']
submission['target'] = y_pred
submission.to_csv('sub_3.csv', index=False)

# Validate
submission = pd.read_csv('sub_3.csv')
print(submission.head())

! kaggle competitions submit -c nlp-getting-started -f sub_1.csv -m "First Attempt w/ TF-IDF + MultinomialNB"

### Logistic Regression

In [None]:
# Logistic regression constructed with no arguments, i.e. library defaults.
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
# Trailing expression: the notebook shows the parameter dict as the cell output.
clf.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [None]:
# 10-fold cross-validation of a fresh logistic regression on the TF-IDF
# features, printing the F1 score of each held-out fold.
X, y = df_new.values, df.target.values
kfold = KFold(n_splits=10)
clf = LogisticRegression()

for train_idx, test_idx in kfold.split(X):
  X_train, y_train = X[train_idx], y[train_idx]
  X_test, y_test = X[test_idx], y[test_idx]
  # fit() returns the estimator itself, so the prediction can be chained.
  y_pred = clf.fit(X_train, y_train).predict(X_test)
  print(f1_score(y_test, y_pred))

0.6395112016293278
0.6119096509240246
0.6427145708582834
0.6381461675579323
0.6052173913043478
0.634390651085142
0.569767441860465
0.5162790697674419
0.6825127334465195
0.7357859531772575


### Support Vector Classifier

In [None]:
# Use the same inputs as the other classifiers: df_new holds only TF-IDF
# term columns, and the labels live in df.target.
X = df_new.values
y = df.target.values
kfold = KFold(n_splits=10)
svc = SVC()

# Per-fold F1 scores, collected so the run can be summarised after the loop.
metrics = []
for train_idx, test_idx in kfold.split(X):
  X_train, X_test = X[train_idx], X[test_idx]
  y_train, y_test = y[train_idx], y[test_idx]
  svc.fit(X_train, y_train)
  y_pred = svc.predict(X_test)
  fold_f1 = f1_score(y_test, y_pred)
  metrics.append(fold_f1)
  print(fold_f1)

print('mean F1: %.4f (+/- %.4f)' % (np.mean(metrics), np.std(metrics)))

0.6488888888888888
0.5862068965517242
0.6189473684210527
0.6179775280898876


KeyboardInterrupt: ignored

### Naive Bayes (Score: 0.78118)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.svm import SVC
# 10-fold cross-validation of multinomial Naive Bayes on the TF-IDF features,
# printing the F1 score of each held-out fold.
X, y = df_new.values, df.target.values
kfold = KFold(n_splits=10)
clf = MultinomialNB()
metrics = []
for train_idx, test_idx in kfold.split(X):
  X_train, y_train = X[train_idx], y[train_idx]
  X_test, y_test = X[test_idx], y[test_idx]
  # fit() returns the estimator itself, so the prediction can be chained.
  y_pred = clf.fit(X_train, y_train).predict(X_test)
  print(f1_score(y_test, y_pred))

0.6753731343283583
0.614481409001957
0.6748681898066783
0.6917057902973396
0.7013782542113323
0.6827794561933535
0.6506849315068493
0.6602687140115163
0.7534456355283308
0.7703703703703704


In [None]:
clf.fit(X, y)
X_test = test_new.values
y_pred = clf.predict(X_test)

# Preparing submission
submission = pd.DataFrame()
submission['id'] = test['id']
submission['target'] = y_pred
submission.to_csv('sub_2.csv', index=False)

# Validate
submission = pd.read_csv('sub_2.csv')
print(submission.head())

! kaggle competitions submit -c nlp-getting-started -f sub_1.csv -m "First Attempt w/ TF-IDF + MultinomialNB"

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1


## Text Classification with LSTM