In [1]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory,
# so the dataset paths used by later cells can be checked against the listing.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
# Read the train and test CSVs into DataFrames in one pass.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
    )
)

# Preview the first rows of the training data.
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# Missing values per column in the training data, largest count first.
(
    train_df
    .isna()
    .sum()
    .sort_values(ascending=False)
)

location    2533
keyword       61
id             0
text           0
target         0
dtype: int64

In [4]:
# Missing values per column in the test data, largest count first.
(
    test_df
    .isna()
    .sum()
    .sort_values(ascending=False)
)

location    1105
keyword       26
id             0
text           0
dtype: int64

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report

# Replace missing 'location' / 'keyword' entries with the literal string 'None'
# in both frames (the columns are overwritten on the frames themselves).
for df in (train_df, test_df):
    for col in ('location', 'keyword'):
        df[col] = df[col].fillna('None')

# Features are the raw text column; labels are the binary target column.
X, y = train_df['text'], train_df['target']

# Single stratified 80/20 train/validation split with a fixed seed.
# n_splits=1, so the generator yields exactly one (train, validation) index pair.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_i, val_i = next(split.split(X, y))
X_train, y_train = X.iloc[train_i], y.iloc[train_i]
X_val, y_val = X.iloc[val_i], y.iloc[val_i]

# Show the class proportions of each split to confirm stratification held.
for labels in (y_train, y_val):
    print(labels.value_counts(normalize=True))

target
0    0.570279
1    0.429721
Name: proportion, dtype: float64
target
0    0.570584
1    0.429416
Name: proportion, dtype: float64


In [6]:
# Bag-of-words counts + ridge classifier.
# (The model fitted below is RidgeClassifier, not logistic regression.)
# The vectorizer drops English stop words and keeps at most 10,000 terms.
vectorizer = CountVectorizer(stop_words='english', max_features=10000)

# Fit the vocabulary on the training split only; the validation split is
# encoded with that already-fitted vocabulary.
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

model = RidgeClassifier(alpha=1.0, solver='lsqr')
model.fit(X_train_vec, y_train)

# Evaluate on the held-out validation split
preds = model.predict(X_val_vec)
print(classification_report(y_val, preds))

              precision    recall  f1-score   support

           0       0.79      0.84      0.81       869
           1       0.77      0.70      0.73       654

    accuracy                           0.78      1523
   macro avg       0.78      0.77      0.77      1523
weighted avg       0.78      0.78      0.78      1523



In [7]:
# Encode the test text with the vocabulary fitted on the training split.
X_test_vec = vectorizer.transform(test_df['text'])

# Load the submission template and set its "target" column to the predictions.
# NOTE(review): predictions are matched to template rows by position; this
# assumes sample_submission.csv lists rows in the same order as test.csv — confirm.
sample_submission = pd.read_csv(
    "/kaggle/input/nlp-getting-started/sample_submission.csv"
).assign(target=model.predict(X_test_vec))

# Write the submission file without the DataFrame index.
sample_submission.to_csv("submission.csv", index=False)