### Importing libraries and datasets

In [1]:
# Importing all the required libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from urllib.parse import urlparse

from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
#Importing the train and test data set
df_train = pd.read_csv('train.csv',encoding = "ISO-8859-1")
df_test = pd.read_csv('test.csv',encoding = "ISO-8859-1")

### Basic EDA

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1157 entries, 0 to 1156
Data columns (total 9 columns):
Source             1157 non-null object
Host               1098 non-null object
Link               1157 non-null object
Date(ET)           1157 non-null object
Time(ET)           1157 non-null object
time(GMT)          996 non-null object
Title              941 non-null object
TRANS_CONV_TEXT    1156 non-null object
Patient_Tag        1157 non-null int64
dtypes: int64(1), object(8)
memory usage: 81.4+ KB


In [4]:
df_train.head()

Unnamed: 0,Source,Host,Link,Date(ET),Time(ET),time(GMT),Title,TRANS_CONV_TEXT,Patient_Tag
0,FORUMS,cafepharma.com,http://cafepharma.com/boards/threads/epstein.5...,6/15/2016,13:58:00,6/15/2016 23:28,Epstein,I don't disagree with you in principle. I'm ju...,0
1,FORUMS,www.patient.co.uk,http://www.patient.co.uk/forums/discuss/enlarg...,5/7/2016,0.820833333,42498.21667,Enlarged Heart.Thread Enlarged Heart,I am always dizzy I get dizzy standing up so I...,1
2,BLOG,http://abcnewsradioonline.com/entertainment-news,http://abcnewsradioonline.com/entertainment-ne...,4/14/2016,15:00:38,4/15/2016 0:30,Queen Latifah Joins American Heart Association...,Axelle/Bauer-Griffin/FilmMagic(NEW YORK) -- Qu...,0
3,FORUMS,www.cancer-forums.net,http://www.cancer-forums.net/viewtopic.php?f=1...,6/18/2016,20:46:00,6/19/2016 6:16,Bulaemia,I am 17 and I have been throwing up for about ...,1
4,FORUMS,www.diyaudio.com,http://www.diyaudio.com/forums/lounge/292252-d...,6/15/2016,3:26:00,6/15/2016 12:56,DIY Silver interconnects and RCAs???,Quote: Originally Posted by Boyan Silyavski Wa...,0


### Data Pre-Processing

In [5]:
#There are null values present in the columns host and title. We will fill them with datas extracted from the link
#But before that we will drop the row for which the conversation is empty
df_train.drop(df_train[df_train['TRANS_CONV_TEXT'].isna()].index[0],inplace=True)
df_train.reset_index(drop=True,inplace=True)

##### Functions to clean data

In [6]:
#Functions to extract week of the day,month, hour and from date and time column
def datetime_data(df):
    #First we convert to datetime format
    df['Date(ET)'] = df['Date(ET)'].apply(lambda x: pd.to_datetime(x,errors = 'coerce',format='%m/%d/%Y'))
    df['Time(ET)'] = df['Time(ET)'].apply(lambda x: pd.to_datetime(x,errors = 'coerce'))
    df['Weekday'] = df['Date(ET)'].apply(lambda x: x.weekday())
    df['Month'] = df['Date(ET)'].apply(lambda x: x.month)
    df['Hour'] = df['Time(ET)'].apply(lambda x: x.hour)
    #Now we can drop the date(et) and time(et) columns
    df.drop(['Date(ET)','Time(ET)'],axis=1,inplace=True)
    #Now let's fill the NA values in our generated column with mode
    df['Hour'].fillna(df['Hour'].mode().iloc[0],inplace=True)
    df['Weekday'].fillna(df['Weekday'].mode().iloc[0],inplace=True)
    df['Month'].fillna(df['Month'].mode().iloc[0],inplace=True)
    #Now we convert them to string type for dummy creation
    df['Hour'] = df['Hour'].astype(str)
    df['Weekday'] = df['Weekday'].astype(str)
    df['Month'] = df['Month'].astype(str)

In [7]:
#function to extract hostname from url
def get_url(x):
    y = urlparse(x)
    return(y.netloc)

#function to extract title from url
def get_title(x):
    y = urlparse(x)
    y = y.path
    y = y.split(sep='/')
    return(y[-1])
#funtion to fill na values in Title
def fillna_title(df):
    lst = df[df['Title'].isna()].index.tolist()
    for i in lst:
        df['Title'].iloc[i] = get_title(df['Link'].iloc[i])

In [8]:
# Function to wrap above funcs and return a clean data set
def clean_data(df):
    #First we drop columns that are not needed
    df.drop(['Host','time(GMT)'],axis=1,inplace=True)
    #make a column for host
    df['Host'] = df['Link'].apply(lambda x: get_url(x))
    #Call function to fill na values in title
    fillna_title(df)
    #Process date time data
    datetime_data(df)

In [9]:
#Then we call the function to clean_data
clean_data(df_train)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [10]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1156 entries, 0 to 1155
Data columns (total 9 columns):
Source             1156 non-null object
Link               1156 non-null object
Title              1156 non-null object
TRANS_CONV_TEXT    1156 non-null object
Patient_Tag        1156 non-null int64
Host               1156 non-null object
Weekday            1156 non-null object
Month              1156 non-null object
Hour               1156 non-null object
dtypes: int64(1), object(8)
memory usage: 81.4+ KB


##### Function to pre process for NLP

In [11]:
wordnet = WordNetLemmatizer()
tfidf = TfidfVectorizer(max_features=13000)
stopwords = stopwords.words('english')
stopwords.extend(['www','co','com','org','net','uk','blogspot','info','de'])
def pre_process(msg,corpus):
    msg = msg.lower()
    msg = re.sub('[^a-zA-Z]',' ', msg)
    msg = nltk.word_tokenize(msg)
    msg = [wordnet.lemmatize(word) for word in msg if word not in stopwords]
    msg = ' '.join(msg)
    corpus.append(msg)

In [12]:
#Function to pre-process
def pre_process_train(df):
    train = pd.DataFrame()
    train['Text'] = df[['Host','Title','TRANS_CONV_TEXT']].apply(lambda x: ' '.join(x),axis=1)
    #train = pd.concat([train1,df[['Source']]],axis=1)
    #del train1
    
    corpus = []
    train['Text'].apply(lambda x : pre_process(x,corpus))
    
    tfidf.fit(corpus)
    x = tfidf.transform(corpus).toarray()
    x1 = pd.DataFrame(x)
    
    #splitting the categorical variables and numerical variables
    x_cat = df[['Source','Weekday','Month','Hour']]
    train_dums = pd.get_dummies(x_cat,drop_first=True)
    del train
    
    train = pd.concat([train_dums,x1],axis=1)
    del x1, train_dums, x, x_cat
    return train

In [13]:
train = pre_process_train(df_train)

In [14]:
train.head()

Unnamed: 0,Source_FACEBOOK,Source_FORUMS,Source_Facebook,Source_YOUTUBE,Weekday_1.0,Weekday_2.0,Weekday_3.0,Weekday_4.0,Weekday_5.0,Weekday_6.0,...,12990,12991,12992,12993,12994,12995,12996,12997,12998,12999
0,0,1,0,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,1,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,1,0,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
target = df_train[['Patient_Tag']]

In [16]:
#Now we define the function to pre-process test data
def pre_process_test(df):
    train = pd.DataFrame()
    train['Text'] = df[['Host','Title','TRANS_CONV_TEXT']].apply(lambda x: ' '.join(x),axis=1)
    #train = pd.concat([train1,df[['Source']]],axis=1)
    #del train1
    
    corpus = []
    train['Text'].apply(lambda x : pre_process(x,corpus))
    
    #tfidf.fit(corpus)
    x = tfidf.transform(corpus).toarray()
    x1 = pd.DataFrame(x)
    
    #splitting the categorical variables and numerical variables
    x_cat = df[['Source','Weekday','Month','Hour']]
    train_dums = pd.get_dummies(x_cat,drop_first=True)
    del train
    
    train = pd.concat([train_dums,x1],axis=1)
    del x1, train_dums, x, x_cat
    return train

In [17]:
target['Patient_Tag'].value_counts()

0    916
1    240
Name: Patient_Tag, dtype: int64

In [18]:
#Above code shows that our data is imbalanced, so we apply synthetic oversampling
sm = SMOTE(random_state = 1)
train, target = sm.fit_sample(train, target)

  y = column_or_1d(y, warn=True)


### Training the Model

In [19]:
x_train,x_test,y_train,y_test = train_test_split(train,target,test_size=.30,random_state=1)
print("x_train shape:",x_train.shape)
print("y_train shape:",y_train.shape)
print("x_test shape:",x_test.shape)
print("y_test shape:",y_test.shape)

x_train shape: (1282, 13038)
y_train shape: (1282,)
x_test shape: (550, 13038)
y_test shape: (550,)


In [20]:
rfc = RandomForestClassifier()
grid_params = {'n_estimators': [200,500,700,1000],
    'max_features': ['auto', 'sqrt', 'log2'],
    'criterion' :['gini', 'entropy']}
rfc1 = GridSearchCV(rfc,grid_params,cv = 10)
rfc1.fit(x_train,y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             ii

In [21]:
rfc1.best_params_

{'criterion': 'entropy', 'max_features': 'log2', 'n_estimators': 1000}

In [22]:
y_pred = rfc1.predict(x_test)

In [23]:
#confusion matrix
cnf2 = confusion_matrix(y_test,y_pred)
cnf2

array([[271,   8],
       [  9, 262]], dtype=int64)

In [24]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       279
           1       0.97      0.97      0.97       271

    accuracy                           0.97       550
   macro avg       0.97      0.97      0.97       550
weighted avg       0.97      0.97      0.97       550



### Now we predict on test data

In [25]:
def predict(df,predictor):
    #first we clean the test data
    clean_data(df)
    df.drop('Unnamed: 9',axis=1,inplace=True)
    #Now we pre-process the test set
    test = pre_process_test(df)
    #Now we predict
    test_pred = predictor.predict(test)
    preds = pd.DataFrame(test_pred,columns=['Patient_Tag'])
    preds.reset_index(inplace=True)
    preds.rename(columns={'index':'Index'},inplace=True)
    preds['Index'] = preds['Index']+1
    preds.head()
    return preds

In [26]:
preds = predict(df_test,rfc1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [28]:
preds.to_csv('Submission4.csv',index=False)