In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('website_classification.csv')
orignal_df = df

orignal_df.head()

Unnamed: 0,id,website,cleaned_text,category
0,0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel
1,1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel
2,2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel
3,3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel
4,4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel


In [3]:
df.columns

Index(['id', 'website', 'cleaned_text', 'category'], dtype='object')

In [4]:
df['category'].value_counts()

Education                          114
Business/Corporate                 109
Travel                             107
Streaming Services                 105
Sports                             104
E-Commerce                         102
Games                               98
News                                96
Health and Fitness                  96
Photography                         93
Computers and Technology            93
Food                                92
Law and Government                  84
Social Networking and Messaging     83
Forums                              16
Adult                               16
Name: category, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split
X_train=X_test=y_train=y_test=pd.Series()

for lable in df['category'].unique():
    print(f'--------------[{lable}]-------------')
    temp_df=df[df['category'] == lable].loc[:,['category','cleaned_text']]
    t_X_train, t_X_test, t_y_train, t_y_test = train_test_split(temp_df['cleaned_text'], temp_df['category'], test_size=0.33, random_state=42)
    X_train =  pd.concat([X_train,t_X_train])
    X_test =  pd.concat([X_test,t_X_test])
    y_train =  pd.concat([y_train,t_y_train])
    y_test =  pd.concat([y_test,t_y_test])
    

    

--------------[Travel]-------------
--------------[Social Networking and Messaging]-------------
--------------[News]-------------
--------------[Streaming Services]-------------
--------------[Sports]-------------
--------------[Photography]-------------
--------------[Law and Government]-------------
--------------[Health and Fitness]-------------
--------------[Games]-------------
--------------[E-Commerce]-------------
--------------[Forums]-------------
--------------[Food]-------------
--------------[Education]-------------
--------------[Computers and Technology]-------------
--------------[Business/Corporate]-------------
--------------[Adult]-------------


  X_train=X_test=y_train=y_test=pd.Series()


In [6]:
X_test.count()
y_test.count()

472

In [7]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report

In [8]:
sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])

sgd.fit(X_train, y_train)

y_pred = sgd.predict(X_test)


In [9]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

print('Confusion Matrix :- \n')

print(confusion_matrix(y_test,y_pred))

print(classification_report(y_test, y_pred,target_names=y_test.unique()))

accuracy 0.923728813559322
Confusion Matrix :- 

[[ 5  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0]
 [ 0 31  2  0  0  0  0  0  0  1  0  0  0  0  0  2]
 [ 0  1 27  0  3  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 33  0  0  0  1  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 37  0  0  0  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 31  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  0  0  0  0  1  0  0  1  0  1  1  1  0]
 [ 0  0  1  0  0  0  0 32  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  1  0  0  0 30  0  0  0  0  1  0  0]
 [ 0  0  0  0  0  0  0  0  0 28  0  0  0  0  0  0]
 [ 0  0  0  0  1  0  0  0  0  0 29  0  0  1  1  0]
 [ 0  0  0  0  1  0  0  0  0  0  0 30  0  0  0  0]
 [ 0  1  3  0  0  1  0  0  0  0  0  3 20  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  1  0  0 34  0  0]
 [ 0  0  0  0  1  0  0  0  0  0  0  0  0  0 34  0]
 [ 0  0  0  0  0  0  0  0  0  0  1  0  0  0  0 35]]
                                 precision    recall  f1-score   support

                         Travel       1.00      0.83      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
import requests as req
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
def predict_website(URL):
    if 'http' not in URL:
        URL = 'http://' + URL
    # proxies={'http':'socks5h://localhost:9050','https':'socks5h://localhost:9050'}
    res = req.get(URL,headers={'User-Agent':UserAgent().random})
    if res.status_code == 200:
        soup = BeautifulSoup(res.content)
        print(f'{URL} is of {sgd.predict([soup.get_text()])[0]} category.')
    else:
        print(f'Unable to get response for {URL}({res.status_code})')

In [11]:
URL = 'https://www.charusat.ac.in'
predict_website(URL)

https://www.charusat.ac.in is of Education category.


In [12]:
import joblib
joblib.dump(sgd, 'website_category_detection_model.pkl', compress=9)

['website_category_detection_model.pkl']