# French web domain classification
## Text based classification
**Team: Samuel - TuAnh - HaiYen**

## Load Data

In [1]:
import os, codecs
from os import path
import re
import numpy as np

In [2]:
path_data = "D:\\Tu Beo\\Education\\AlteGrad19\\Data Challenge\\data\\data\\"

In [3]:
# Read training data
with open(path_data+"train.csv", 'r') as f:
    train_data = f.read().splitlines()

train_hosts = list()
y_train = list()
for row in train_data:
    host, label = row.split(",")
    train_hosts.append(host)
    y_train.append(label.lower())

# Read test data
with open(path_data+"test.csv", 'r') as f:
    test_hosts = f.read().splitlines()

In [4]:
# Load the textual content of a set of webpages for each host into the dictionary "text". 
# The encoding parameter is required since the majority of our text is french.
text = dict()
filenames = os.listdir(path_data+'text/text')
for filename in filenames:
    with codecs.open(path.join(path_data+'text/text/', filename), encoding='latin-1') as f: 
        text[filename] = f.read().replace("\n", "").lower()

## Text Preprocessing

In [5]:
from nltk.corpus import stopwords 
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english')).union(set(stopwords.words('french'))) 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MyPC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
%%time
text_processed = {}
for j,host in enumerate(text.keys()):
    doc = text[host].split()
    for i in range(len(doc)) :
        doc[i] = doc[i].lstrip().rstrip()
        doc[i] = re.sub(r"[\^\$\-()\"#/@;:<>{}`+=~|\]\[._\\!?,%&*]", "", doc[i])
        if doc[i] in stop_words or len(doc[i]) < 3:
            doc[i] = ''
    text_processed[host] = ' '.join(doc)

    if j % 500 == 0:
        print(str(j)+"/"+str(len(text.keys())), end="\r")

Wall time: 4min 50s


In [7]:
text_processed['1']

"alternate alternate alternate alternate alternate alternate alternate alternate alternate alternate alternate alternate alternate alternate alternate alternate alternate alternate alternate alternate alternate alternate alternate alternate iframe httpswwwgoogletagmanagercomnshtmlidgtm52l954 offline icon  browsing offline   functionality   limited livraison gratuite voir politique  livraisontrouvez  magasin  plus proche  homme collections  nouvelle saison  top tendance  boutique vacances  superdry sport  superdry ski  petits prix  promotion   polos  vãªtements d'extã©rieur  vestes  vestes d'hiver  coupevent  doudounes  vestes  cuir  doudounes sans manches  hauts  tshirts  polos  sweat  capuche  dã©bardeurs  chemises  tops  pulls  bas  shorts  maillots  bain  jeans  survãªtements  pantalons  accessoires  sacs  sousvãªtements  chapeaux  casquettes  dã©tente  autres accessoires  lunettes  soleil  chaussures  claquettes  tongs  baskets  bottes  chaussures  femme collections  nouvelle saiso

## Compute TF-IDF vector

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# Get textual content of web hosts of the train set
train_data = list()
for host in train_hosts:
    if host in text_processed:
        train_data.append(text_processed[host])
    else:
        train_data.append('')
        
# Get textual content of web hosts of the test set
test_data = list()
for host in test_hosts:
    if host in text_processed:
        test_data.append(text_processed[host])
    else:
        test_data.append('')

In [10]:
%%time
vec = TfidfVectorizer(decode_error='ignore', strip_accents='unicode', encoding='latin-1', min_df=50, max_df=2000)
X_train = vec.fit_transform(train_data)

# Create the test matrix following the same approach as in the case of the training matrix
X_test = vec.transform(test_data)

print("Train matrix dimensionality: ", X_train.shape)
print("Test matrix dimensionality: ", X_test.shape)

Train matrix dimensionality:  (2125, 5512)
Test matrix dimensionality:  (560, 5512)
Wall time: 31.4 s


## Normalize the data

In [11]:
from sklearn import preprocessing
X_train_normalized = preprocessing.normalize(X_train, norm='max', axis=1)

## Train the model

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss,accuracy_score

In [13]:
# Train/validation split
X_train_small, X_val, y_train_small, y_val = train_test_split(X_train_normalized, y_train, test_size=0.1, random_state=42)

# Use logistic regression to classify the webpages of the test set
clf = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
clf.fit(X_train_small, y_train_small)

y_pred = clf.predict_proba(X_train_small)
print("TRAIN loss:", log_loss(y_train_small, y_pred))

y_pred_label = clf.predict(X_train_small)
print("TRAIN accuracy:", accuracy_score(y_train_small, y_pred_label))

y_pred = clf.predict_proba(X_val)
print("VAL loss:", log_loss(y_val, y_pred))

y_pred_label = clf.predict(X_val)
print("VAL accuracy:", accuracy_score(y_val, y_pred_label))

TRAIN loss: 0.5153696693156739
TRAIN accuracy: 0.8885983263598326
VAL loss: 1.2280138521806483
VAL accuracy: 0.6009389671361502


## Compute the prediction scores for all the nodes in the graph

In [19]:
import csv

In [14]:
%%time
# Compute the list of the remaining nodes
other_hosts = list(set(text_processed.keys()).difference(set(train_hosts).union(test_hosts)))

# Get textual content of web hosts of the other nodes
other_data = list()
for host in other_hosts:
    if host in text_processed:
        other_data.append(text_processed[host])
    else:
        other_data.append('')
        
# Compute the tf-idf vector of the other nodes
X_other = vec.transform(other_data)
print("Other matrix dimensionality: ", X_other.shape)

Other matrix dimensionality:  (25449, 5512)
Wall time: 3min 16s


In [15]:
X_train_normalized = preprocessing.normalize(X_train, norm='max', axis=1)
X_test_normalized = preprocessing.normalize(X_test, norm='max', axis=1)
X_other_normalized = preprocessing.normalize(X_other, norm='max', axis=1)

In [16]:
# Retrain for all the train data
clf = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
clf.fit(X_train_normalized, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
y_train_pred = clf.predict_proba(X_train_normalized)
y_test_pred = clf.predict_proba(X_test_normalized)
y_other_pred = clf.predict_proba(X_other_normalized)

In [20]:
# Write the output to the file
with open(path_data+'text_all_scores.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    lst = clf.classes_.tolist()
    lst.insert(0, "Host")
    writer.writerow(lst)
    for i, host in enumerate(train_hosts):
        lst = y_train_pred[i,:].tolist()
        lst.insert(0, host)
        writer.writerow(lst)
    for i, host in enumerate(test_hosts):
        lst = y_test_pred[i,:].tolist()
        lst.insert(0, host)
        writer.writerow(lst)
    for i, host in enumerate(other_hosts):
        lst = y_other_pred[i,:].tolist()
        lst.insert(0, host)
        writer.writerow(lst)

## Output for Kaggle

In [21]:
# Write predictions to a file
with open(path_data+'text_test_to_kaggle.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    lst = clf.classes_.tolist()
    lst.insert(0, "Host")
    writer.writerow(lst)
    for i,test_host in enumerate(test_hosts):
        lst = y_test_pred[i,:].tolist()
        lst.insert(0, test_host)
        writer.writerow(lst)

## Save the train/test tf-idf vectors

In [23]:
from scipy import sparse

sparse.save_npz(path_data+"X_train_text.npz", X_train)
sparse.save_npz(path_data+"X_test_text.npz", X_test)
sparse.save_npz(path_data+"X_other_text.npz", X_other)

In [29]:
with open(path_data+"other_host.csv", 'w') as f:
    f.writelines('\n'.join(other_hosts))