# Code for predicting the category of a website

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
import warnings

# Notebook-wide settings: silence every warning category and cap the number
# of rows pandas prints for a frame at 10.
warnings.simplefilter("ignore")
pd.options.display.max_rows = 10

### Get the pickle file of the cleaned dataframe

In [2]:
# Load the cleaned dataframe that was pickled by an earlier step.
# NOTE(review): unpickling executes arbitrary code; only load this file if it
# was produced by a trusted source.
filtered_pickle_path = '../GettingData/data/filtered.pkl'
df = pd.read_pickle(filtered_pickle_path)

### Getting the corpus and using CountVectorizer to convert it to vector form

In [3]:
# The page titles are the documents; each one becomes a row of token counts.
vectorizer = CountVectorizer(min_df=1)
corpus = df["title"]
title_counts = vectorizer.fit_transform(corpus)
# Densify the document-term matrix for the downstream cells.
# NOTE(review): a dense array can be memory-hungry for a large vocabulary —
# consider keeping the sparse matrix if the dataset grows.
X = title_counts.toarray()

### Making the target variables

In [4]:
# Encode each distinct value of the "result" column as an integer code,
# numbered in order of first appearance, and map the column to those codes.
labels = df["result"]
categories = labels.unique()
category_dict = dict((name, code) for code, name in enumerate(categories))
results = labels.map(category_dict)

### Splitting the data for training and testing (20 percent of the data is used for testing)

In [5]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
split_parts = train_test_split(X, results, test_size=0.2, random_state=1)
x_train, x_test, y_train, y_test = split_parts

### Creating the model and testing the accuracy of the algorithm (about 87 percent accurate on the test split)

In [6]:
# Fit a multinomial Naive Bayes classifier on the training split (fit returns
# the estimator itself), then display its mean accuracy on the held-out split.
clf = MultinomialNB().fit(x_train, y_train)
clf.score(x_test, y_test)

0.86899563318777295

### Predicting test values

In [7]:
# Integer category codes predicted for every row of the held-out split.
test_predictions = clf.predict(x_test)
test_predictions

array([0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0])

#### Simple example showing how the algorithm is working

In [8]:
# Classify one example title. The title is wrapped in a list because the
# vectorizer is given a collection of documents, one entry per title.
text = ["DealPly | Never miss a deal again"]
vec_text = vectorizer.transform(text).toarray()
# `category_dict` was built by enumerating `categories`, so `categories[code]`
# is the category name for an integer code. This replaces the reverse lookup
# through `category_dict.keys()[...]`, which only works on Python 2 (dict
# views are not indexable on Python 3) and needs a linear search.
categories[clf.predict(vec_text)[0]]

u'commerce'

In [9]:
# Classify a second example title. The title is wrapped in a list because the
# vectorizer is given a collection of documents, one entry per title.
text = ["Home | National Informatics Centre"]
vec_text = vectorizer.transform(text).toarray()
# `category_dict` was built by enumerating `categories`, so `categories[code]`
# is the category name for an integer code. This replaces the reverse lookup
# through `category_dict.keys()[...]`, which only works on Python 2 (dict
# views are not indexable on Python 3) and needs a linear search.
categories[clf.predict(vec_text)[0]]

u'other'

### Getting the titles that are used for prediction

In [10]:
# Load the titles to classify; later cells read its 'url' and 'title' columns.
titles_json_path = 'gettitle/titles.json'
df_p = pd.read_json(titles_json_path)

### Cleaning the data

In [11]:
# Drop rows with any missing value, then trim leading and trailing
# whitespace from every title in a single pass.
df_p.dropna(inplace=True)
df_p['title'] = df_p['title'].str.strip()

### Saving the output to results.txt file inside results folder

In [12]:
# Flag written for each predicted category name: True for 'commerce',
# False for 'other'.
d = {'commerce': True, 'other': False}

urls = df_p['url'].tolist()
titles = df_p['title'].tolist()

# Vectorize and classify every title in one batch. The vectorizer must be
# given a collection of documents: passing a single bare string (as the
# per-row loop used to) is either rejected or split into one "document" per
# character, so only the first character would have been classified.
# The guard keeps the empty case a no-op instead of asking the classifier
# to predict on zero samples.
predicted_codes = clf.predict(vectorizer.transform(titles)) if titles else []

# Appends one "<url>\t<flag>" line per title.
# NOTE(review): the file is opened in append mode, so re-running this cell
# adds the same rows again — confirm that is intended.
with open('../results/results.txt', 'a') as outfile:
    for url, code in zip(urls, predicted_codes):
        # `categories[code]` is the category name for an integer code,
        # because `category_dict` was built by enumerating `categories`.
        outfile.write('{}\t{}\n'.format(url, d[categories[code]]))