In [3]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer 
from sklearn.feature_extraction.text import *
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

In [4]:
#Importing Data into a Pandas Data Frame
data_train = pd.read_csv("dataset/train.csv")
data_test = pd.read_csv("dataset/test.csv")

In [5]:
# A function to clean the data
stop_words = set(stopwords.words("english"))
def clean_data(text,lowercase=False,remove_stops = False, stemming = False):
    txt = str(text)
    txt = re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt = re.sub(r'\n',r' ',txt)
    
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stop_words])
        
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])
        
    return txt

In [6]:
# Combining the Test and Train data, then cleaning the whole data (ie Description) together
data_test['Is_Response'] = np.nan
alldata = pd.concat([data_train,data_test]).reset_index(drop=True)
alldata['Description'] = alldata['Description'].map(lambda x: clean_data(x,lowercase=True,remove_stops=True,stemming = True))

In [7]:
#Calculating the Bag of Words and TDIDF Matrix
cnt_vector = CountVectorizer(analyzer = 'word',ngram_range =(1,1),min_df = 150,max_features = 500)
tf_idf = TfidfVectorizer(analyzer = 'word',ngram_range =(1,1),min_df = 150,max_features = 500)

bagofwords = cnt_vector.fit_transform(alldata['Description'])
tfidf = tf_idf.fit_transform(alldata['Description'])

In [8]:
# Creating Encoders for Categorical Variables
cols = ["Browser_Used","Device_Used"]

for i in cols:
    lbl = LabelEncoder()
    alldata[i] = lbl.fit_transform(alldata[i])
    
bow_df = pd.DataFrame(bagofwords.todense())
tfidf_df = pd.DataFrame(tfidf.todense())
bow_df.columns = ['col'+ str(x) for x in bow_df.columns]
tfidf_df.columns = ['col' + str(x) for x in tfidf_df.columns]

In [9]:
#Splitting the Bag of words and TFIDF Data Frames into seperate Test and Train data
bow_df_train = bow_df[:len(data_train)]
bow_df_test = bow_df[len(data_train):]

tfid_df_train = tfidf_df[:len(data_train)]
tfid_df_test = tfidf_df[len(data_train):]

In [10]:
#Now Splitting the Combined Data into Test and Train Data
train = alldata[~pd.isnull(alldata.Is_Response)]
test = alldata[pd.isnull(alldata.Is_Response)]

In [12]:
train["Is_Response"] = [1 if i == "happy" else 0 for i in train["Is_Response"]]

train_features1 = pd.concat([train[cols],bow_df_train],axis=1)
test_features1 = pd.concat([test[cols],bow_df_test],axis=1)

train_features2 = pd.concat([train[cols],tfid_df_train],axis=1)
test_features2 = pd.concat([test[cols],tfid_df_test],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [32]:
from sklearn.naive_bayes import GaussianNB
target = train['Is_Response']
clf = GaussianNB()
pred = clf.fit(train_features1,target).predict(test_features1)
clf2 = GaussianNB()
pred2 = clf2.fit(train_features2,target).predict(test_features2)
