# Simple Text Classification (TFIDF + LinearSVC)

In this notebook, a simple text-classification model is developed to classify disaster tweets.

Steps:
- Read the data
- Preprocess the data
- Model with TF-IDF and LinearSVC

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
# Load the training split from the competition's input directory.
df_train = pd.read_csv('../input/nlp-getting-started/train.csv')
# Trailing expression: rendered as the cell output (first rows of the frame).
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# Load the test split from the competition's input directory.
df_test = pd.read_csv('../input/nlp-getting-started/test.csv')
# Trailing expression: rendered as the cell output (first rows of the frame).
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [4]:
# Print dtypes and non-null counts for each split, with a 40-character rule between them.
for position, frame in enumerate((df_train, df_test)):
    if position:
        print('_' * 40)
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [5]:
# Set the 'id' columns aside (test_ID is used later to build the submission)
# and remove them from the frames, since the id is not used as a feature.
#
# DataFrame.pop returns the column and removes it from the frame in one step.
# The membership guard keeps the cell re-runnable: reading df['id'] and then
# dropping it in place raises KeyError the second time the cell is executed.
if "id" in df_train.columns:
    train_ID = df_train.pop("id")
if "id" in df_test.columns:
    test_ID = df_test.pop("id")

## Preprocessing

In [6]:
# Per-column count of missing values: train is printed, test is the cell's displayed result.
train_missing = df_train.isna().sum()
print(train_missing)
df_test.isna().sum()


keyword       61
location    2533
text           0
target         0
dtype: int64


keyword       26
location    1105
text           0
dtype: int64

## Missing Values

### Keyword

Replace NaN values with the placeholder string `'None'`.

In [7]:
# Missing keywords become the literal string 'None' in both splits.
for frame in (df_train, df_test):
    frame['keyword'] = frame['keyword'].fillna('None')

### Location


In [8]:
# Missing locations become the literal string 'None' in both splits.
for frame in (df_train, df_test):
    frame['location'] = frame['location'].fillna('None')

In [9]:
def _merge_text_fields(frame):
    """Return a frame whose `text` is "<keyword> <location> <text>", without the keyword/location columns."""
    combined = frame["keyword"] + " " + frame["location"] + " " + frame["text"]
    return frame.assign(text=combined).drop(columns=["keyword", "location"])


# Fold keyword and location into the tweet text for both splits.
df_train = _merge_text_fields(df_train)
df_test = _merge_text_fields(df_test)

## Data Cleaning

In [10]:
# Keep only the first occurrence of each (text, target) pair.
is_repeat = df_train.duplicated(subset=['text', 'target'], keep='first')
df_train = df_train[~is_repeat]

In [11]:
import re
# Literal substitutions applied, in order, to lowercased text while apostrophes
# and symbols are still present. Order matters: "won't"/"can't" must be handled
# before the generic "n't" rule. A bare possessive 's is deliberately absent:
# the letters-only pass and single-letter removal in clean() discard it.
_CLEAN_REPLACEMENTS = (
    ("′", "'"), ("’", "'"),
    ("won't", "will not"), ("cannot", "can not"), ("can't", "can not"),
    ("n't", " not"), ("what's", "what is"), ("it's", "it is"),
    ("'ve", " have"), ("i'm", "i am"), ("'re", " are"),
    ("he's", "he is"), ("she's", "she is"), ("'ll", " will"),
    ("%", " percent "), ("₹", " rupee "), ("$", " dollar "), ("€", " euro "),
)


def clean(sen):
    """Normalise a raw tweet to lowercase, letters-only, single-spaced text.

    Steps, in order:
      1. lowercase;
      2. remove URLs (http/https first, then any other ``scheme://`` form);
      3. expand contractions and spell out %, ₹, $ and €;
      4. replace every character outside a-z with a space (this discards
         digits and punctuation);
      5. remove stand-alone single letters;
      6. collapse whitespace runs and trim the ends.

    URL removal and contraction expansion must run before step 4: once
    ':', '/' and apostrophes have been replaced by spaces, those patterns
    can no longer match.

    Args:
        sen: the text to clean (a ``str``).

    Returns:
        The cleaned string; empty if nothing but noise was present.
    """
    sentence = sen.lower()

    # URLs are replaced by a space so the words on either side stay separate.
    sentence = re.sub(r"http[s]*://[^\s]+", " ", sentence)
    sentence = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s\/]*))*", " ", sentence)

    for old, new in _CLEAN_REPLACEMENTS:
        sentence = sentence.replace(old, new)

    # Remove punctuation and numbers.
    sentence = re.sub(r"[^a-zA-Z]", " ", sentence)

    # Single character removal; word boundaries also catch letters at the
    # start/end of the string and runs of adjacent single letters.
    sentence = re.sub(r"\b[a-zA-Z]\b", " ", sentence)

    # Removing multiple spaces.
    sentence = re.sub(r"\s+", " ", sentence)

    return sentence.strip()

from nltk.stem.porter import PorterStemmer
import string
from nltk.tokenize import word_tokenize

pstem = PorterStemmer()


def clean_text(text):
    """Lowercase `text`, delete digits and ASCII punctuation, then Porter-stem every token.

    Tokens are produced by NLTK's ``word_tokenize`` and re-joined with single
    spaces. No stop-word filtering is applied.
    """
    without_digits = re.sub('[0-9]', '', text.lower())
    without_punct = without_digits.translate(str.maketrans('', '', string.punctuation))
    stems = [pstem.stem(token) for token in word_tokenize(without_punct)]
    return ' '.join(stems)

In [12]:
# Run clean() and then clean_text() over the text column of each split.
df_train['text'] = df_train['text'].apply(clean).apply(clean_text)
df_test['text'] = df_test['text'].apply(clean).apply(clean_text)


## Modelling


In [13]:
# The label is the 'target' column; the features are every other column.
y_train = df_train['target']
X_train = df_train.drop(columns='target')

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.svm import LinearSVC

from xgboost import XGBClassifier

# Vectorise the 'text' column with TF-IDF. The column is selected by a scalar
# string so the vectorizer is handed a 1-D sequence of documents.
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(), 'text'),
    ],
)

# LinearSVC's dual coordinate-descent solver shuffles the data with a
# pseudo-random generator; fixing random_state makes repeated runs of the
# notebook produce the same model and therefore the same submission.
text_clf = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', LinearSVC(loss='hinge', random_state=42)),
])

# Feed the training data through the pipeline
text_clf.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('text', TfidfVectorizer(),
                                                  'text')])),
                ('clf', LinearSVC(loss='hinge'))])

In [15]:
# Predict a target label for every row of the test frame with the fitted pipeline.
predictions = text_clf.predict(df_test)

In [16]:
# Pair each saved test id with its predicted label and write the result as a CSV without the index.
submission_columns = {'id': test_ID, 'target': predictions}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)