# **Importing libraries**

In [None]:
import numpy as np
import pandas as pd
import random
import copy
from sklearn.tree import DecisionTreeRegressor
from multiprocessing import Pool
from datetime import datetime
import math

# NLTK Stopwords
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')

# Fasttext
!pip install fasttext
import fasttext

# PyTorch
import torch
import torch.nn.functional as F

# To get vector embeddings
!pip install sister
import sister

# Processing Parquets
!pip install fastparquet
!unzip Training_DSF.zip

# **Preparing the Dataset**

In [None]:
# Sentence-embedding model and English stop-word set used by the cleaning
# and similarity steps of this notebook.
sentence_embedding = sister.MeanEmbedding(lang="en")
stop_words = set(stopwords.words('english'))

import os
directory = '/content/Training_DSF'

# Only regular files are read. The list is filtered *before* it is indexed so
# that a sub-directory inside `directory` can neither be passed to
# read_parquet nor push a real file out of range. Sorting makes the load
# order (and therefore which duplicate `drop_duplicates` keeps) deterministic.
entries = sorted(
    item for item in os.listdir(directory)
    if os.path.isfile(os.path.join(directory, item))
)
file_count = len(entries)
if file_count == 0:
    raise FileNotFoundError(f"No data files found in {directory}")

# Read every file first and concatenate once: DataFrame.append was removed in
# pandas 2.0 and growing a frame inside a loop copies it on every iteration.
article_prod = pd.concat(
    [pd.read_parquet(os.path.join(directory, name)) for name in entries]
)
print(article_prod.shape)

# Columns that are not used anywhere later in the notebook.
UNUSED_COLUMNS = [
    'mid', 'slug', 'imageUrl', 'metaDescription', 'offset',
    'partition', 'processTimestamp', 'contentType', 'updatedTimestamp', 'hour',
]
article_prod = (
    article_prod
    .drop_duplicates(subset='id', keep="first")
    .drop(columns=UNUSED_COLUMNS)
    .dropna()
)


def remove_stop_words(text):
    """Drop stop words from a whitespace-separated string and lower-case the rest.

    NOTE(review): the stop-word test runs on the original casing, so
    capitalised stop words (e.g. "The") are kept and merely lower-cased.
    This is left unchanged on purpose: the hard-coded query headline used
    later in the notebook must match headlines cleaned exactly this way.
    """
    return ' '.join(word.lower() for word in text.split() if word not in stop_words)


article_prod['headline'] = article_prod['headline'].apply(remove_stop_words)
cleaned_text = article_prod['text'].apply(remove_stop_words)
cleaned_tags = article_prod['tagName'].apply(remove_stop_words)

article_prod.head()

# Creating the Headline, Text, and Tag Similarity Matrices

In [None]:
def create_matrix(list):
    """Build the pairwise cosine-similarity matrix for a sequence of texts.

    Each text is embedded with the notebook-level ``sentence_embedding``
    model and wrapped in a torch tensor. The result is a square list of
    lists in which entry ``[a][b]`` is the value returned by
    ``F.cosine_similarity`` (with ``dim=0``) for the embeddings of texts
    ``a`` and ``b``.
    """
    # One embedding tensor per input text, in input order.
    embeddings = [torch.tensor(sentence_embedding(text)) for text in list]

    # Full n x n comparison, including the diagonal and both (a, b) / (b, a).
    return [
        [F.cosine_similarity(row_vec, col_vec, dim=0) for col_vec in embeddings]
        for row_vec in embeddings
    ]
    
# One pairwise cosine-similarity matrix per text field. Row/column k of each
# matrix corresponds to the k-th row of `article_prod`.
# headline_matrix is later bucketed into the relevance grade; text_matrix and
# tag_matrix are later used as model features.
headline_matrix = create_matrix(article_prod['headline'])
text_matrix = create_matrix(cleaned_text)
tag_matrix = create_matrix(cleaned_tags)

# Creating the Time Difference Matrix

In [None]:
# Number of articles; also used by the pair-building loop in a later cell.
n = article_prod.shape[0]
time_diff = []  # not read by any visible cell; kept so the notebook namespace is unchanged

# Parse each "%Y-%m-%d" date string once instead of twice per (i, j) pair.
parsed_dates = [
    datetime.strptime(date_string, "%Y-%m-%d")
    for date_string in article_prod['date'].values
]

# time_matrix[i][j] is the signed difference in whole days: date_i - date_j.
time_matrix = [
    [(date_i - date_j).days for date_j in parsed_dates]
    for date_i in parsed_dates
]

# Combining Headline, Text, and Tag Similarity with Time Difference

In [None]:
# Empty frame declaring the four core columns of the pairwise training table.
# In the pair loop later in this cell, 'qid' / 'id' receive the ids of
# articles i and j, 'grade' the findGrade() bucket, and 'features' a list of
# floats.
final_dataframe = pd.DataFrame(columns = ['qid', 'id', 'grade' ,'features'])

# TODO: Figure out a proper value for the grades.
def findGrade(num):
    """Bucket a similarity score into an integer relevance grade from 0 to 5.

    Buckets are half-open intervals, ``lower <= num < upper``:

        num < 0.1175            -> 0
        0.1175 <= num < 0.2212  -> 1
        0.2212 <= num < 0.393   -> 2
        0.393  <= num < 0.632   -> 3
        0.632  <= num < 0.865   -> 4
        num >= 0.865            -> 5

    Values exactly on a threshold, and negative values, previously fell
    through every branch and were graded 5; they now land in the adjacent /
    lowest bucket.
    """
    # Exclusive upper bound of grades 0..4, in ascending order.
    upper_bounds = (0.1175, 0.2212, 0.393, 0.632, 0.865)
    for grade, bound in enumerate(upper_bounds):
        if num < bound:
            return grade
    return 5

# Column order of the pairwise table.
PAIR_COLUMNS = ['qid', 'id', 'grade', 'features', 'headline_qid', 'headline_id']

headlines = article_prod['headline'].values
article_ids = article_prod['id'].values

# Collect one record per ordered pair (i, j), i != j, and build the frame in
# a single call: DataFrame.append was removed in pandas 2.0 and appending
# row-by-row copies the whole frame on every iteration.
pair_rows = []
for i in range(n):
    for j in range(n):
        if i == j:
            continue
        # NOTE(review): time_matrix[i][j] is signed, so exp(-delta) grows
        # without bound when article j is newer than article i (math.exp
        # raises OverflowError past roughly 709 days). Consider abs(delta)
        # if a symmetric time decay was intended.
        pair_rows.append({
            'qid': article_ids[i],
            'id': article_ids[j],
            'grade': findGrade(headline_matrix[i][j].item()),
            'features': [
                text_matrix[i][j].item(),
                tag_matrix[i][j].item(),
                math.exp(-time_matrix[i][j]),
            ],
            'headline_qid': headlines[i],
            'headline_id': headlines[j],
        })

final_dataframe = pd.DataFrame(pair_rows, columns=PAIR_COLUMNS)

# One integer group per query article; predictions start at zero.
final_dataframe['group_id'] = final_dataframe.groupby('qid').ngroup()
final_dataframe['last_prediction'] = 0.0
final_dataframe = final_dataframe.sort_values(['group_id', 'last_prediction'], ascending=[True, False], kind='stable')
final_dataframe.head()

In [None]:

def compute_lambdas(lambdas_per_query):
    """Compute per-row pairwise lambdas and weights for one boosting round.

    Parameters
    ----------
    lambdas_per_query : pandas.DataFrame
        Must contain the columns ``group_id``, ``grade`` and
        ``last_prediction``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows sorted by ``group_id`` and descending ``last_prediction``,
        with these columns added (or overwritten):

        * ``display_rank`` - 0-based position inside the group,
        * ``discount``     - ``1 / log2(2 + display_rank)``,
        * ``gain``         - ``2**grade - 1``,
        * ``lambda``       - accumulated pairwise lambda for the row,
        * ``weight``       - accumulated pairwise weight for the row.
    """
    lambdas_per_query = lambdas_per_query.sort_values(['group_id', 'last_prediction'], ascending=[True, False], kind='stable')
    lambdas_per_query['display_rank'] = lambdas_per_query.groupby('group_id').cumcount()

    # DCG-style discount and gain. TODO: generalize to other ranking metrics.
    lambdas_per_query['discount'] = 1 / np.log2(2 + lambdas_per_query['display_rank'])
    lambdas_per_query['gain'] = (2**lambdas_per_query['grade'] - 1)

    # Self-join on group_id: one row per ordered pair (x, y) of rows in the
    # same group. Only the needed columns are carried to limit memory.
    to_swap = lambdas_per_query[['group_id', 'display_rank', 'grade', 'last_prediction', 'discount', 'gain']]
    swaps = to_swap.merge(to_swap, on='group_id', how='outer')

    # delta - absolute change in discounted gain if x and y swapped positions
    swaps['delta'] = np.abs((swaps['discount_x'] - swaps['discount_y']) * (swaps['gain_x'] - swaps['gain_y']))

    # rho - logistic function of the current prediction difference
    swaps['rho'] = 1 / (1 + np.exp(swaps['last_prediction_x'] - swaps['last_prediction_y']))

    # weight - accumulated and returned, but not read by the training loop
    swaps['weight'] = swaps['rho'] * (1.0 - swaps['rho']) * swaps['delta']

    # lambda is non-zero only where x has the higher grade. The column is
    # built as float in one step; creating it as integer 0 and then assigning
    # floats through .loc is an incompatible-dtype assignment that newer
    # pandas versions deprecate.
    x_better = swaps['grade_x'] > swaps['grade_y']
    swaps['lambda'] = np.where(x_better, swaps['delta'] * swaps['rho'], 0.0).astype(float)

    # Accumulate per row: a row gains the lambdas of pairs where it is x and
    # loses the lambdas of pairs where it is y.
    lambdas_x = swaps.groupby(['group_id', 'display_rank_x'])['lambda'].sum().rename('lambda')
    lambdas_y = swaps.groupby(['group_id', 'display_rank_y'])['lambda'].sum().rename('lambda')

    weights_x = swaps.groupby(['group_id', 'display_rank_x'])['weight'].sum().rename('weight')
    weights_y = swaps.groupby(['group_id', 'display_rank_y'])['weight'].sum().rename('weight')

    weights = weights_x + weights_y
    lambdas = lambdas_x - lambdas_y

    # Attach the accumulated values back onto the rows by (group, rank).
    lambdas_per_query = lambdas_per_query.merge(lambdas,
                                                left_on=['group_id', 'display_rank'],
                                                right_on=['group_id', 'display_rank_x'],
                                                how='left')
    lambdas_per_query = lambdas_per_query.merge(weights,
                                                left_on=['group_id', 'display_rank'],
                                                right_on=['group_id', 'display_rank_x'],
                                                how='left')

    return lambdas_per_query

In [None]:
# Trees fitted by lambda_mart_pure, in training order (one per round).
ensemble = []


def lambda_mart_pure(final_dataframe, rounds=20,
                     learning_rate=0.1, max_leaf_nodes=8):
    """Fit one regression tree per round on the lambdas from compute_lambdas.

    Every fitted tree is appended to the module-level ``ensemble`` list.
    After each round the tree's predictions, scaled by ``learning_rate``,
    are added to the ``last_prediction`` column, and a mean top-10
    discounted gain per ``qid`` is printed.

    Returns the working copy of ``final_dataframe`` after the last round,
    without the per-round ``lambda`` and ``weight`` columns.
    """
    scored = final_dataframe.copy()
    scored['last_prediction'] = 0.0

    for round_no in range(rounds):
        print(f"round {round_no}")

        # Step 1: pairwise targets for this round.
        scored = compute_lambdas(scored)

        # Step 2: regression tree fitted on this round's lambdas.
        feature_rows = scored['features'].tolist()
        tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes)
        tree.fit(feature_rows, scored['lambda'])

        # Step 3: keep the tree and fold its scaled output into the score.
        ensemble.append(tree)
        step = tree.predict(feature_rows) * learning_rate
        scored['last_prediction'] = scored['last_prediction'] + step

        print(scored.loc[0, ['grade', 'last_prediction']])

        # Progress report from this round's gain and discount columns.
        print("Train DCGs")
        scored['discounted_gain'] = scored['gain'] * scored['discount']
        top_ten = scored[scored['display_rank'] < 10]
        dcg = top_ten.groupby('qid')['discounted_gain'].sum().mean()
        print("mean   ", dcg)
        print("----------")

        scored = scored.drop(columns=['lambda', 'weight'])
    return scored


# Learning rate shared by training and by the scorer below so that the
# scorer reproduces the `last_prediction` accumulated during training.
LEARNING_RATE = 0.01

lambdas_per_query = lambda_mart_pure(final_dataframe=final_dataframe, rounds=50, max_leaf_nodes=10, learning_rate=LEARNING_RATE)


class BoostedTreeScorer:
    """Score feature rows with every tree of the boosted ensemble.

    Training accumulates ``learning_rate * tree.predict(features)`` over all
    rounds into ``last_prediction``. Using only the last tree as the model
    would rank by the final correction step alone, so this wrapper sums all
    trees and exposes the same ``predict`` method a single tree has.
    """

    def __init__(self, trees, learning_rate):
        self.trees = list(trees)  # snapshot; later appends to the source list are ignored
        self.learning_rate = learning_rate

    def predict(self, features):
        """Return one score per feature row, as a 1-D float array."""
        total = np.zeros(len(features))
        for tree in self.trees:
            total += tree.predict(features)
        return self.learning_rate * total


model = BoostedTreeScorer(ensemble, LEARNING_RATE)

In [None]:
import heapq
r = final_dataframe.shape[0]  # row count; no longer read by rank(), kept for the notebook namespace


def rank(article_h, m):
    """Return up to ``m`` best-scored candidates for a query headline.

    Parameters
    ----------
    article_h : str
        Cleaned headline of the query article, matched against the
        ``headline_qid`` column of ``final_dataframe``.
    m : int
        Maximum number of candidates to return.

    Returns
    -------
    list
        ``[negated_score, headline_id]`` pairs, best first (ascending
        negated score, ties ordered by headline). Fewer than ``m`` pairs, or
        an empty list, are returned when fewer candidates exist; previously
        that case raised ``IndexError`` from popping an empty heap.
        ``negated_score`` is a plain float rather than a 1-element array, so
        the pairs can be turned into a regular array without ragged shapes.
    """
    candidates = final_dataframe[final_dataframe['headline_qid'] == article_h]
    if candidates.empty:
        return []

    # Score all candidates in one predict call instead of one call per row.
    feature_rows = np.array(candidates['features'].tolist())
    scores = model.predict(feature_rows)

    scored = [
        [-float(score), headline]
        for score, headline in zip(scores, candidates['headline_id'])
    ]
    # Same order as pushing everything on a min-heap and popping m times.
    return heapq.nsmallest(m, scored)
 


In [None]:
# Each [score, headline] pair is wrapped in its own one-item list, so
# z[k][0][1] is the k-th headline. This keeps the layout that splitting ten
# results into ten chunks produced, without converting the pairs to a NumPy
# array (a pair holding an array next to a string is ragged, which recent
# NumPy versions refuse to convert).
z = [[pair] for pair in rank('kbc 13: amitabh bachchan recalls when farah khan scolded him set', 10)]
print("the ranked list of articles is :")
print (z)


In [None]:
print("The ranked list of articles is :")
# Walk over whatever chunks exist instead of assuming exactly ten, and skip
# empty chunks, so a short result list prints without an IndexError.
position = 0
for chunk in z:
    if len(chunk) == 0:
        continue
    position += 1
    print(position, ". ", end="")
    # chunk[0] is a [score, headline] pair; element 1 is the headline.
    print(chunk[0][1])