# French web domain classification
## Graph and Text combination
**Team: Samuel - TuAnh - HaiYen**

**Note1: This notebook requires the file text_all_scores (as well as some other files such as X_train/test/other_text, other_hosts), which can be obtained by running the notebook TEXT_FINAL.ipynb**

**Note2: As we have cleaned and rerun the file, the obtained results may not be as well as the final result on Kaggle.**

## Load data

In [1]:
import os
import codecs
from os import path
import csv
import networkx as nx
import numpy as np

In [2]:
path_data = "D:\\Tu Beo\\Education\\AlteGrad19\\Data Challenge\\data\\data\\"

### Read train and test hosts

In [3]:
# Read training data
with open(path_data+"train.csv", 'r') as f:
    train_data = f.read().splitlines()
    
train_hosts = list()
y_train = list()
for row in train_data:
    host, label = row.split(",")
    train_hosts.append(host)
    y_train.append(label.lower())

In [4]:
# Read test data
with open(path_data+"test.csv", 'r') as f:
    test_hosts = f.read().splitlines()

### Read prediction scores

In [5]:
logits_dict = {}
with open(path_data+'text_all_scores.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count > 0 and len(row) > 0:
            logits_dict[row[0]] = row[1:]
        line_count += 1
    print(f'Processed {line_count} lines and found {len(logits_dict)} hosts.')

Processed 56270 lines and found 28003 hosts.


### Read Graph

In [6]:
# Create a directed, weighted graph
G = nx.read_weighted_edgelist(path_data+'edgelist.txt', create_using=nx.DiGraph())

print("Number of nodes: ", G.number_of_nodes())
print("Number of edges: ", G.number_of_edges())

Number of nodes:  28002
Number of edges:  319498


## Compute the average scores of neighbors

In [7]:
def get_avg_logits(hosts, G=G, num_classes=8, logits_dict=logits_dict):
    succ_logits = []
    pred_logits = []
    for host in hosts:
        sum_logits = np.zeros(num_classes)
        for node in G.successors(host):
            sum_logits += np.array(logits_dict[node], dtype='float64')*G.edges[host,node]['weight']
        if np.sum(sum_logits) == 0:
            sum_logits = np.ones(num_classes)/num_classes
        else:
            sum_logits = sum_logits/np.sum(sum_logits)
        succ_logits.append(sum_logits)

        sum_logits = np.zeros(num_classes)
        for node in G.predecessors(host):
            sum_logits += np.array(logits_dict[node], dtype='float64')*G.edges[node, host]['weight']
        if np.sum(sum_logits) == 0:
            sum_logits = np.ones(num_classes)/num_classes
        else:
            sum_logits = sum_logits/np.sum(sum_logits)
        pred_logits.append(sum_logits)
    return succ_logits, pred_logits

In [8]:
train_succ_logits, train_pred_logits = get_avg_logits(train_hosts)
X_train_logits = np.concatenate((train_succ_logits, train_pred_logits), axis=1)
X_train_logits.shape

(2125, 16)

## Get the tf-idf vectors of the nodes

**Note: Uncomment and run the following block only if you did not save the matrices in the TEXT part, otherwise load the saved matrix in the block after**

In [9]:
# %%time
# # Load and preprocess text
# text = dict()
# filenames = os.listdir(path_data+'text/text')
# for filename in filenames:
#     with codecs.open(path.join(path_data+'text/text/', filename), encoding='latin-1') as f: 
#         text[filename] = f.read().replace("\n", "").lower()
        
# from nltk.corpus import stopwords 
# import nltk
# nltk.download('stopwords')
# stop_words = set(stopwords.words('english')).union(set(stopwords.words('french'))) 

# text_processed = {}
# for j,host in enumerate(text.keys()):
#     doc = text[host].split()
#     for i in range(len(doc)) :
#         doc[i] = doc[i].lstrip().rstrip()
#         doc[i] = re.sub(r"[\^\$\-()\"#/@;:<>{}`+=~|\]\[._\\!?,%&*]", "", doc[i])
#         if doc[i] in stop_words or len(doc[i]) < 3:
#             doc[i] = ''
#     text_processed[host] = ' '.join(doc)

#     if j % 500 == 0:
#         print(str(j)+"/"+str(len(text.keys())), end="\r")
        
# # Get textual content of web hosts
# train_text = list()
# for host in train_hosts:
#     if host in text_processed:
#         train_text.append(text_processed[host])
#     else:
#         train_text.append('')
# test_text = list()
# for host in test_hosts:
#     if host in text_processed:
#         test_text.append(text_processed[host])
#     else:
#         test_text.append('')
# other_hosts = list(set(text_processed.keys()).difference(set(train_hosts).union(test_hosts)))
# other_data = list()
# for host in other_hosts:
#     if host in text_processed:
#         other_data.append(text_processed[host])
#     else:
#         other_data.append('')
        
# # Compute tfidf vector
# from sklearn.feature_extraction.text import TfidfVectorizer
# vec = TfidfVectorizer(decode_error='ignore', strip_accents='unicode', encoding='latin-1', min_df=50, max_df=2000)
# X_train_text = vec.fit_transform(train_text)

# X_test_text = vec.transform(test_text)
# X_other_text = vec.transform(other_data)

# print("Train matrix dimensionality: ", X_train_text.shape)
# print("Test matrix dimensionality: ", X_test_text.shape)
# print("Other matrix dimensionality: ", X_other_text.shape)

In [10]:
from scipy import sparse

X_train_text = sparse.load_npz(path_data+"X_train_text.npz")
X_test_text = sparse.load_npz(path_data+"X_test_text.npz")
X_other_text = sparse.load_npz(path_data+"X_other_text.npz")
with open(path_data+"other_host.csv", 'r') as f:
    other_hosts = f.read().splitlines()

In [11]:
# Remove hosts that are not in the graph
other_taken_idx = []
nodes_set = set(G.nodes())
for i, node in enumerate(other_hosts):
    if node in nodes_set:
        other_taken_idx.append(i)
X_other_text = X_other_text[other_taken_idx]
other_hosts = np.array(other_hosts)[other_taken_idx]

In [12]:
from sklearn import preprocessing
X_train_text_normalized = preprocessing.normalize(X_train_text, norm='max', axis=1)

## Dimensionality Reduction

In [13]:
from sklearn.decomposition import TruncatedSVD

In [14]:
%%time
tsvd = TruncatedSVD(n_components=120)
X_train_text_PCA = tsvd.fit_transform(X_train_text_normalized)
X_train_text_PCA.shape

Wall time: 1.2 s


## Combine and train

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss,accuracy_score

In [16]:
X_train_comb = np.concatenate((X_train_text_PCA, X_train_logits), axis = 1)
X_train_comb.shape

(2125, 136)

In [17]:
X_train_comb_normalized = preprocessing.normalize(X_train_comb, norm='max', axis=1)

In [18]:
# Train/validation split
X_train_small, X_val, y_train_small, y_val = train_test_split(X_train_comb_normalized, y_train, test_size=0.1, random_state=42)

# Use logistic regression to classify the webpages of the test set
clf = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
clf.fit(X_train_small, y_train_small)

y_pred = clf.predict_proba(X_train_small)
print("TRAIN loss:", log_loss(y_train_small, y_pred))

y_pred_label = clf.predict(X_train_small)
print("TRAIN accuracy:", accuracy_score(y_train_small, y_pred_label))

y_pred = clf.predict_proba(X_val)
print("VAL loss:", log_loss(y_val, y_pred))

y_pred_label = clf.predict(X_val)
print("VAL accuracy:", accuracy_score(y_val, y_pred_label))

TRAIN loss: 0.9039830551271831
TRAIN accuracy: 0.6725941422594143
VAL loss: 1.153610001311298
VAL accuracy: 0.5633802816901409


## Output all the results (for next iteration)

In [19]:
other_succ_logits, other_pred_logits = get_avg_logits(other_hosts)
test_succ_logits, test_pred_logits = get_avg_logits(test_hosts)
X_other_logits = np.concatenate((other_succ_logits, other_pred_logits), axis=1)
X_test_logits = np.concatenate((test_succ_logits, test_pred_logits), axis=1)

In [20]:
X_test_text_normalized = preprocessing.normalize(X_test_text, norm='max', axis=1)
X_other_text_normalized = preprocessing.normalize(X_other_text, norm='max', axis=1)

In [21]:
X_test_text_PCA = tsvd.transform(X_test_text_normalized)
X_other_text_PCA = tsvd.transform(X_other_text_normalized)
X_test_comb_normalized =  preprocessing.normalize(np.concatenate((X_test_text_PCA, X_test_logits), axis = 1), norm='max', axis=1)
X_other_comb_normalized =  preprocessing.normalize(np.concatenate((X_other_text_PCA, X_other_logits), axis = 1), norm='max', axis=1)

In [22]:
clf = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
clf.fit(X_train_comb_normalized, y_train)

y_pred = clf.predict_proba(X_train_comb_normalized)
print("TRAIN loss:", log_loss(y_train, y_pred))

y_pred_label = clf.predict(X_train_comb_normalized)
print("TRAIN accuracy:", accuracy_score(y_train, y_pred_label))

TRAIN loss: 0.9116297410661994
TRAIN accuracy: 0.6635294117647059


In [23]:
y_train_pred = clf.predict_proba(X_train_comb_normalized)
y_test_pred = clf.predict_proba(X_test_comb_normalized)
y_other_pred = clf.predict_proba(X_other_comb_normalized)

In [24]:
# Write the output to the file
with open(path_data+'textgraph_all_scores.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    lst = clf.classes_.tolist()
    lst.insert(0, "Host")
    writer.writerow(lst)
    for i, host in enumerate(train_hosts):
        lst = y_train_pred[i,:].tolist()
        lst.insert(0, host)
        writer.writerow(lst)
    for i, host in enumerate(test_hosts):
        lst = y_test_pred[i,:].tolist()
        lst.insert(0, host)
        writer.writerow(lst)
    for i, host in enumerate(other_hosts):
        lst = y_other_pred[i,:].tolist()
        lst.insert(0, host)
        writer.writerow(lst)

## Next iteration

In [25]:
logits_dict2 = {}
with open(path_data+'textgraph_all_scores.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count > 0 and len(row) > 0:
            logits_dict2[row[0]] = row[1:]
        line_count += 1
    print(f'Processed {line_count} lines and found {len(logits_dict2)} hosts.')

Processed 56268 lines and found 28002 hosts.


In [26]:
train_succ_logits, train_pred_logits = get_avg_logits(train_hosts, logits_dict=logits_dict2)
X_train_logits = np.concatenate((train_succ_logits, train_pred_logits), axis=1)
X_train_comb_normalized =  preprocessing.normalize(np.concatenate((X_train_text_PCA, X_train_logits), axis = 1), norm='max', axis=1)

In [27]:
X_train_small, X_val, y_train_small, y_val = train_test_split(X_train_comb_normalized, y_train, test_size=0.1, random_state=42)

# Use logistic regression to classify the webpages of the test set
clf = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
clf.fit(X_train_small, y_train_small)

y_pred = clf.predict_proba(X_train_small)
print("TRAIN loss:", log_loss(y_train_small, y_pred))

y_pred_label = clf.predict(X_train_small)
print("TRAIN accuracy:", accuracy_score(y_train_small, y_pred_label))

y_pred = clf.predict_proba(X_val)
print("VAL loss:", log_loss(y_val, y_pred))

y_pred_label = clf.predict(X_val)
print("VAL accuracy:", accuracy_score(y_val, y_pred_label))

TRAIN loss: 0.89076671160758
TRAIN accuracy: 0.6767782426778243
VAL loss: 1.1625661743773452
VAL accuracy: 0.5774647887323944


## Feed Forward NN

In [28]:
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import regularizers
from keras.layers.normalization import BatchNormalization

Using TensorFlow backend.


In [29]:
label_set = set(y_train)
num_classes = len(label_set)
label_set_map = {'health/medical': 3,
 'sports': 6,
 'entertainment': 2,
 'politics/government/law': 5,
 'business/finance': 0,
 'tech/science': 7,
 'news/press': 4,
 'education/research': 1}
y_train_num = [label_set_map[label] for label in y_train]
y_train_category = to_categorical(y_train_num, num_classes)

In [30]:
reg_cst = 0.00001

model = Sequential()
model.add(Dense(128, input_dim=X_train_comb_normalized.shape[1],kernel_regularizer=regularizers.l2(reg_cst)))
#model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.2)) 
model.add(Dense(64,kernel_regularizer=regularizers.l2(reg_cst)))
#model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.2))
#model.add(Dense(32, activation='relu',kernel_regularizer=regularizers.l2(reg_cst)))
#model.add(BatchNormalization())
#model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax',kernel_regularizer=regularizers.l2(reg_cst)))
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               17536     
_________________________________________________________________
activation_1 (Activation)    (None, 128)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
activation_2 (Activation)    (None, 64)                0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
__________

In [31]:
checkpointer = ModelCheckpoint(filepath=path_data+"weights/weights.hdf5", verbose=1, save_best_only=True, monitor='val_loss')

model.fit(X_train_comb_normalized, y_train_category, 
          batch_size=32, epochs=20, 
          validation_split=0.05, shuffle=True, 
          callbacks = [checkpointer],
          verbose = 1)

model.load_weights(path_data+"weights/weights.hdf5")

Train on 2018 samples, validate on 107 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 1.23694, saving model to D:\Tu Beo\Education\AlteGrad19\Data Challenge\data\data\weights/weights.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 1.23694 to 0.97057, saving model to D:\Tu Beo\Education\AlteGrad19\Data Challenge\data\data\weights/weights.hdf5
Epoch 3/20

Epoch 00003: val_loss improved from 0.97057 to 0.94002, saving model to D:\Tu Beo\Education\AlteGrad19\Data Challenge\data\data\weights/weights.hdf5
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.94002
Epoch 5/20

Epoch 00005: val_loss improved from 0.94002 to 0.87708, saving model to D:\Tu Beo\Education\AlteGrad19\Data Challenge\data\data\weights/weights.hdf5
Epoch 6/20

Epoch 00006: val_loss did not improve from 0.87708
Epoch 7/20

Epoch 00007: val_loss did not improve from 0.87708
Epoch 8/20

Epoch 00008: val_loss did not improve from 0.87708
Epoch 9/20

Epoch 00009: val_loss did not improve from 0.8770

In [32]:
y_val_num = [label_set_map[label] for label in y_val]
y_pred = model.predict(X_val)
print("VAL loss:", log_loss(y_val_num, y_pred))
y_pred = model.predict_classes(X_val)
print("VAL accuracy:", accuracy_score(y_val_num, y_pred))

VAL loss: 1.0313645015894808
VAL accuracy: 0.6197183098591549


## Output for Kaggle

In [33]:
test_succ_logits, test_pred_logits = get_avg_logits(test_hosts, logits_dict=logits_dict2)
X_test_logits = np.concatenate((test_succ_logits, test_pred_logits), axis=1)
X_test_text_PCA = tsvd.transform(X_test_text)
X_test_comb_normalized =  preprocessing.normalize(np.concatenate((X_test_text_PCA, X_test_logits), axis = 1), norm='max', axis=1)

**Neural Net**

In [34]:
y_pred = model.predict(X_train_comb_normalized)
print("TRAIN loss:", log_loss(y_train_num, y_pred))

y_pred_label = model.predict_classes(X_train_comb_normalized)
print("TRAIN accuracy:", accuracy_score(y_train_num, y_pred_label))

y_pred_nn = model.predict(X_test_comb_normalized)

TRAIN loss: 1.0077274528402178
TRAIN accuracy: 0.6465882352941177


In [35]:
# Write predictions to a file
with open(path_data+'textgraph_test_NN_to_kaggle.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    lst = clf.classes_.tolist()
    lst.insert(0, "Host")
    writer.writerow(lst)
    for i,test_host in enumerate(test_hosts):
        lst = y_pred_nn[i,:].tolist()
        lst.insert(0, test_host)
        writer.writerow(lst)

**Logistic Regression**

In [36]:
clf = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
clf.fit(X_train_comb_normalized, y_train)

y_pred = clf.predict_proba(X_train_comb_normalized)
print("TRAIN loss:", log_loss(y_train, y_pred))

y_pred_label = clf.predict(X_train_comb_normalized)
print("TRAIN accuracy:", accuracy_score(y_train, y_pred_label))

y_pred = clf.predict_proba(X_test_comb_normalized)

TRAIN loss: 0.9011271923744552
TRAIN accuracy: 0.6696470588235294


In [37]:
# Write predictions to a file
with open(path_data+'textgraph_test_logreg_to_kaggle.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    lst = clf.classes_.tolist()
    lst.insert(0, "Host")
    writer.writerow(lst)
    for i,test_host in enumerate(test_hosts):
        lst = y_pred[i,:].tolist()
        lst.insert(0, test_host)
        writer.writerow(lst)