# **Importing libraries**

In [1]:
import numpy as np
import pandas as pd
import random
import copy
from sklearn.tree import DecisionTreeRegressor
from multiprocessing import Pool
from datetime import datetime
import math

# NLTK Stopwords
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')

# Fasttext
!pip install fasttext
import fasttext

# PyTorch
import torch
import torch.nn.functional as F

# To get vector embeddings
!pip install sister
import sister

# Processing Parquets
!pip install fastparquet
!unzip Training_DSF.zip

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 7.1 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.10.0-py3-none-any.whl (213 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3161975 sha256=d2de98292ed4a81ac8cc54a91466ff17c9f3da1233def4a3d7367fc9565cef80
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a29c8f4f19e38f9c02a345bab9b
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.10.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sister
  Downloading sister-0.1.10.tar.gz (24 kB)
Collecting Ja

# **Preparing the Dataset**

In [2]:
sentence_embedding = sister.MeanEmbedding(lang="en")
stop_words = set(stopwords.words('english'))

import os
directory = '/content/Training_DSF'
entries = os.listdir('/content/Training_DSF')
file_count = len([item for item in os.listdir(directory) if os.path.isfile(os.path.join(directory, item))])
article_prod = pd.DataFrame(columns = [])
 
# append datasets to the list
for i in range(file_count):
    temp_df = pd.read_parquet('/content/Training_DSF/'+entries[i])
    article_prod=article_prod.append(temp_df)
print(article_prod  ) 
#article_prod = pd.read_parquet('/content/Training_DS.zip', engine='fastparquet')
article_prod = article_prod.drop_duplicates(subset='id', keep="first")
article_prod = article_prod.drop('mid', axis=1)
article_prod = article_prod.drop('slug', axis=1)
article_prod = article_prod.drop('imageUrl', axis=1)
article_prod = article_prod.drop('metaDescription', axis=1)
article_prod = article_prod.drop('offset', axis=1)
article_prod = article_prod.drop('partition', axis=1)
article_prod = article_prod.drop('processTimestamp', axis=1)
article_prod = article_prod.drop('contentType', axis=1)
article_prod = article_prod.drop('updatedTimestamp', axis=1)
article_prod = article_prod.drop('hour', axis=1)

article_prod = article_prod.dropna()
article_prod['headline'] = article_prod['headline'].apply(lambda words: ' '.join(word.lower() for word in words.split() if word not in stop_words))
cleaned_text = article_prod['text'].apply(lambda words: ' '.join(word.lower() for word in words.split() if word not in stop_words))
cleaned_tags = article_prod['tagName'].apply(lambda words: ' '.join(word.lower() for word in words.split() if word not in stop_words))

article_prod

Downloading from https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.zip...


Loading model...




                                       id     mid  \
0    d2db6348-a43d-44ab-8338-3dc83f7f7528  130915   
1    d2db6348-a43d-44ab-8338-3dc83f7f7528  130915   
2    d2db6348-a43d-44ab-8338-3dc83f7f7528  130915   
3    a011dd67-2799-4f51-bf41-183b5e1ce181  130915   
4    a011dd67-2799-4f51-bf41-183b5e1ce181  130915   
..                                    ...     ...   
161  fe86068c-837e-4ccf-886a-9cc18b30e683  130915   
162  164200d2-05dc-4508-9e53-b3ea23c0f873  130915   
163  164200d2-05dc-4508-9e53-b3ea23c0f873  130915   
164  2f3a486f-a9ef-4dd4-9f32-4b12f0378507  130915   
165  2f3a486f-a9ef-4dd4-9f32-4b12f0378507  130915   

                                              headline  \
0    Taliban Executes Brother of Former Afghan Vice...   
1    Taliban Executes Brother of Former Afghan Vice...   
2    Taliban Executes Brother of Former Afghan Vice...   
3    Indian Men’s Cricket Team’s Tryst With COVID-1...   
4    Indian Men’s Cricket Team’s Tryst With COVID-1...   
..             

Unnamed: 0,id,headline,text,authorName,publishedAt,firstPublishedAt,sectionName,tagName,date
0,d2db6348-a43d-44ab-8338-3dc83f7f7528,taliban executes brother former afghan vice-pr...,Former Afghanistan Vice-President Amrullah Sal...,The Quint,1631329689017,1631289945016,Hot-News:News:World,"Amrullah Saleh,Afghanistan Crisis,",2021-09-11
3,a011dd67-2799-4f51-bf41-183b5e1ce181,indian men’s cricket team’s tryst with covid-1...,The Indian men’s cricket team have been in Eng...,The Quint,1631332383989,1631278926700,Cricket:India:Sports,"India vs England,",2021-09-11
5,cbe5b8f3-f037-451e-ab22-208d9df333a4,10 haunting photographs 9/11 attacks,"Around 3,000 people were killed in New York Ci...",Monica Sarup,1631333531452,1599798796094,Photos:World,"9/11 attacks,20 years of 9/11,",2021-09-11
8,35c53239-7956-4fcf-aaaa-467dbcec7947,"mumbai: 30-yr-old woman raped, brutalised; acc...",One person has been arrested in connection wit...,The Quint,1631333844998,1631284879933,Breaking-News:Gender:Hot-News:News,"Women Safety,Mumbai Rape,",2021-09-11
10,969a0c45-30f0-4663-a3b8-b5860a518f2a,beyond fruity smell: ingredients floor cleaner...,Covid-19 has been a wake-up call for many Indi...,Partner,1631337932101,1631337932101,BrandStudio,"Sanitation,hygiene,Cleanliness,coronavirus,domex,",2021-09-11
13,40663950-a76e-48d1-bdee-85cf1e0e4fe3,"20 years later, america vows ‘never forget’ 9/11",Video Producer: Srishti TyagiVideo Editor: San...,Karen Matthews and Jennifer Peltz,1568271341105,1568271341105,World,"US,9/11,America,donald trump,9/11 attacks,",2021-09-11
15,a86a2e66-3457-45a9-91b1-a41e9520c60d,dowry harassment drove vismaya suicide: kerala...,Twenty-two-year old Kerala woman Vismaya V Nai...,The News Minute,1631340453736,1631340453736,India,"Dowry Deaths,Vismaya V Nair,",2021-09-11
19,5118bf67-6f24-427c-8cfc-dde65feceb74,manchester test cancelled: do india win englan...,A little less than two hours before the toss a...,The Quint,1631351098974,1631351098974,Cricket:India:Sports,"India vs England,",2021-09-11
32,88db098e-aefa-4ec1-807a-3d10e69e0106,whatsapp announces end-to-end encrypted backup...,"In a major privacy update, WhatsApp chat backu...",The Quint,1631352812572,1631352812572,Tech-News,"WhatsApp Encryption,End to End Encryption,What...",2021-09-11
34,5b2999ae-05bf-4c87-8832-3618ba74767f,how 9/11 changed world & america-pakistan-indi...,(This story has been reposted from The Quint’s...,Raghav Bahl,1631353014005,1599797503086,India:Opinion:Quint-News-Feed:The-Indian-American,"India,Pakistan,US,9/11,Raghav's Take,20 years ...",2021-09-11


# Creating the Headline, Text, and Tag Similarity Matrices

In [3]:
def create_matrix(list):
    tensor_list = []
    for i in list:
        tensor_list.append(torch.tensor(sentence_embedding(i)))
    # Now we have the list of tensors.
    matrix = []
    for i in tensor_list:
        tmplist = []
        for j in tensor_list:
            tmplist.append(F.cosine_similarity(i, j, dim=0))
        matrix.append(tmplist)
    return matrix
    
headline_matrix = create_matrix(article_prod['headline'])
text_matrix = create_matrix(cleaned_text)
tag_matrix = create_matrix(cleaned_tags)

# Creating the Time Difference Matrix

In [4]:
n = article_prod.shape[0]
time_matrix = [[0 for x in range(n)] for y in range(n)] 
time_diff = []
for i in range(n):
    for j in range(n):
        time_matrix[i][j]= (datetime.strptime(article_prod['date'].values[i], "%Y-%m-%d")-datetime.strptime(article_prod['date'].values[j], "%Y-%m-%d")).days

# Combining Text/Headline similarity, and Time difference

In [5]:
final_dataframe = pd.DataFrame(columns = ['qid', 'id', 'grade' ,'features'])

# TODO: Figure out a proper value for the grades.
def findGrade(num):
  if(0.00 <= num and num < 0.1175 ):
     return 0
  elif (0.1175 < num and num < 0.2212):
     return 1
  elif(0.2212 < num and num < 0.393):
     return 2
  elif(0.393< num and num < 0.632  ):
      return 3
  elif(0.632 < num and num < 0.865):
      return 4
  else:
      return 5

for i in range(n):
    for j in range(n):
        if(i==j): continue;
        tmp = {'headline_qid':article_prod['headline'].values[i],'headline_id':article_prod['headline'].values[j],'qid': article_prod['id'].values[i], 'id': article_prod['id'].values[j], 'grade': findGrade(headline_matrix[i][j].item()),'features' : [text_matrix[i][j].item(),tag_matrix[i][j].item(),math.exp(-time_matrix[i][j])]}
        final_dataframe = final_dataframe.append(tmp,ignore_index=True)
        
final_dataframe['group_id']=final_dataframe.groupby(['qid'])['id'].ngroup()
final_dataframe['last_prediction'] = 0.0
final_dataframe = final_dataframe.sort_values(['group_id', 'last_prediction'], ascending=[True, False], kind='stable')
final_dataframe

Unnamed: 0,qid,id,grade,features,headline_id,headline_qid,group_id,last_prediction
1776,02547bd8-2264-4cd1-bd0c-a9941b8754b8,d2db6348-a43d-44ab-8338-3dc83f7f7528,3,"[0.9179891347885132, 0.8888632655143738, 7.389...",taliban executes brother former afghan vice-pr...,no phd master's degree valuable today: taliban...,0,0.0
1777,02547bd8-2264-4cd1-bd0c-a9941b8754b8,a011dd67-2799-4f51-bf41-183b5e1ce181,3,"[0.9084380269050598, 0.4314965307712555, 7.389...",indian men’s cricket team’s tryst with covid-1...,no phd master's degree valuable today: taliban...,0,0.0
1778,02547bd8-2264-4cd1-bd0c-a9941b8754b8,cbe5b8f3-f037-451e-ab22-208d9df333a4,3,"[0.848663330078125, 0.4284764528274536, 7.3890...",10 haunting photographs 9/11 attacks,no phd master's degree valuable today: taliban...,0,0.0
1779,02547bd8-2264-4cd1-bd0c-a9941b8754b8,35c53239-7956-4fcf-aaaa-467dbcec7947,3,"[0.9111664295196533, 0.5041289925575256, 7.389...","mumbai: 30-yr-old woman raped, brutalised; acc...",no phd master's degree valuable today: taliban...,0,0.0
1780,02547bd8-2264-4cd1-bd0c-a9941b8754b8,969a0c45-30f0-4663-a3b8-b5860a518f2a,3,"[0.818446695804596, 0.46078506112098694, 7.389...",beyond fruity smell: ingredients floor cleaner...,no phd master's degree valuable today: taliban...,0,0.0
...,...,...,...,...,...,...,...,...
2251,fe86068c-837e-4ccf-886a-9cc18b30e683,c39b3862-8306-4f8d-8110-0d4457325ff4,4,"[0.92011559009552, 0.626268208026886, 1.0]",'we are still amid second covid wave': health ...,review: 'once upon time calcutta' tenderly cap...,48,0.0
2252,fe86068c-837e-4ccf-886a-9cc18b30e683,c0ee6c2a-06e0-49bd-9228-cc5a31d44439,4,"[0.8891655206680298, 0.5255205035209656, 1.0]",jee main 2021 session 4 result to be declared ...,review: 'once upon time calcutta' tenderly cap...,48,0.0
2253,fe86068c-837e-4ccf-886a-9cc18b30e683,483a34df-bca9-4721-b5b8-64de331333ca,3,"[0.8963913321495056, 0.7311475872993469, 1.0]",gyanvapi mosque: allahabad hc stays varanasi c...,review: 'once upon time calcutta' tenderly cap...,48,0.0
2254,fe86068c-837e-4ccf-886a-9cc18b30e683,164200d2-05dc-4508-9e53-b3ea23c0f873,4,"[0.9430419206619263, 0.6924434900283813, 1.0]",'save us': scary scenes mystery fever grips ut...,review: 'once upon time calcutta' tenderly cap...,48,0.0


In [6]:

def compute_lambdas(lambdas_per_query):
    lambdas_per_query = lambdas_per_query.sort_values(['group_id', 'last_prediction'], ascending=[True, False], kind='stable')
    lambdas_per_query['display_rank'] = lambdas_per_query.groupby('group_id').cumcount()

    #TBD - How do generalize this to any metric?
    lambdas_per_query['discount'] = 1 / np.log2(2 + lambdas_per_query['display_rank'])
    lambdas_per_query['gain'] = (2**lambdas_per_query['grade'] - 1)

    # swaps dataframe holds each pair-wise swap computed (shrink columns for memory?)   
    # Optimization of swaps = lambdas_per_query.merge(lambdas_per_query, on='qid', how='outer')
    # to limit to just needed columns
    to_swap = lambdas_per_query[['group_id', 'display_rank', 'grade', 'last_prediction', 'discount', 'gain']]
    #to_swap = lambdas_per_query
    swaps = to_swap.merge(to_swap, on='group_id', how='outer')

    # delta - delta in DCG due to swap
    swaps['delta'] = np.abs((swaps['discount_x'] - swaps['discount_y']) * (swaps['gain_x'] - swaps['gain_y']))
    
    # rho - based on current model prediction delta
    swaps['rho'] = 1 / (1 + np.exp(swaps['last_prediction_x'] - swaps['last_prediction_y']))
    
    # If you want to be pure gradient boosting, weight reweights each models prediction
    # I haven't found this to matter in practice
    swaps['weight'] = swaps['rho'] * (1.0 - swaps['rho']) * swaps['delta']

    # Compute lambdas (the next model in ensemble's predictors) when grade_x > grade_y
    swaps['lambda'] = 0
    slice_x_better =swaps[swaps['grade_x'] > swaps['grade_y']]
    swaps.loc[swaps['grade_x'] > swaps['grade_y'], 'lambda'] = slice_x_better['delta'] * slice_x_better['rho']
    
    # accumulate lambdas and add back to model
    lambdas_x = swaps.groupby(['group_id', 'display_rank_x'])['lambda'].sum().rename('lambda')
    lambdas_y = swaps.groupby(['group_id', 'display_rank_y'])['lambda'].sum().rename('lambda')

    weights_x = swaps.groupby(['group_id', 'display_rank_x'])['weight'].sum().rename('weight')
    weights_y = swaps.groupby(['group_id', 'display_rank_y'])['weight'].sum().rename('weight')
    
    weights = weights_x + weights_y
    lambdas = lambdas_x - lambdas_y

    lambdas_per_query = lambdas_per_query.merge(lambdas, 
                                                left_on=['group_id', 'display_rank'], 
                                                right_on=['group_id', 'display_rank_x'], 
                                                how='left')
    lambdas_per_query = lambdas_per_query.merge(weights, 
                                                left_on=['group_id', 'display_rank'], 
                                                right_on=['group_id', 'display_rank_x'], 
                                                how='left')

    return lambdas_per_query

In [7]:
ensemble=[]
def lambda_mart_pure(final_dataframe, rounds=20,
                     learning_rate=0.1, max_leaf_nodes=8):

    lambdas_per_query = final_dataframe.copy()
    lambdas_per_query['last_prediction'] = 0.0

    for i in range(0, rounds):
        print(f"round {i}")

        # ------------------
        #1. Build pair-wise predictors for this round
        lambdas_per_query = compute_lambdas(lambdas_per_query)


        # ------------------
        #2. Train a regression tree on this round's lambdas
        features = lambdas_per_query['features'].tolist()
        new_tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes)
        new_tree.fit(features, lambdas_per_query['lambda'])    

        # -------------------
        #4. Add to ensemble, recreate last prediction
        ensemble.append(new_tree)
        next_predictions = new_tree.predict(features)
        lambdas_per_query['last_prediction'] += (next_predictions * learning_rate) 
        
        print(lambdas_per_query.loc[0, ['grade', 'last_prediction']])
        
        print("Train DCGs")
        lambdas_per_query['discounted_gain'] = lambdas_per_query['gain'] * lambdas_per_query['discount'] 
        dcg = lambdas_per_query[lambdas_per_query['display_rank'] < 10].groupby('qid')['discounted_gain'].sum().mean()
        print("mean   ", dcg)
        print("----------")
        
        lambdas_per_query = lambdas_per_query.drop(['lambda', 'weight'], axis=1)
    return lambdas_per_query


lambdas_per_query = lambda_mart_pure(final_dataframe=final_dataframe, rounds=50, max_leaf_nodes=10, learning_rate=0.01)
model = ensemble[len(ensemble) -1]

round 0
grade                     3
last_prediction    0.166165
Name: 0, dtype: object
Train DCGs
mean    40.75633230942108
----------
round 1
grade                     3
last_prediction    0.204197
Name: 0, dtype: object
Train DCGs
mean    49.4376976504061
----------
round 2
grade                     3
last_prediction   -0.049018
Name: 0, dtype: object
Train DCGs
mean    53.795958230677456
----------
round 3
grade                     3
last_prediction    0.120857
Name: 0, dtype: object
Train DCGs
mean    54.10998868503444
----------
round 4
grade                     3
last_prediction   -0.035595
Name: 0, dtype: object
Train DCGs
mean    53.94067791248284
----------
round 5
grade                     3
last_prediction    0.238209
Name: 0, dtype: object
Train DCGs
mean    54.460077030478026
----------
round 6
grade                     3
last_prediction    0.293041
Name: 0, dtype: object
Train DCGs
mean    54.34739951541245
----------
round 7
grade                     3
last_prediction   

In [8]:
import heapq
r = final_dataframe.shape[0]
# Given an article id
def rank(article_h,m):
 H = []
 
 for i in range(r):
     if(article_h == final_dataframe.iloc[i]['headline_qid']):
         heapq.heappush(H,[-model.predict(np.array(final_dataframe.iloc[i]['features']).reshape(1, -1)),final_dataframe.iloc[i]['headline_id']])
 k_elems = []
 for i in range(m):
     k_elems.append(heapq.heappop(H))
 return k_elems  
 


In [9]:
z=np.array_split(rank('kbc 13: amitabh bachchan recalls when farah khan scolded him set',10),10)
print("the ranked list of articles is :")
print (z)


the ranked list of articles is :
[array([[array([-3.2276144]),
        'captain vikram batra, the ‘shershaah’ pakistan army feared during kargil war']],
      dtype=object), array([[array([-3.2276144]),
        'only parents have right: saba ali khan kareena-saif naming their son jeh']],
      dtype=object), array([[array([-3.2276144]),
        "review: 'laabam' serves communism on a fast-food platter"]],
      dtype=object), array([[array([-3.2276144]),
        "review: 'once upon time calcutta' tenderly captures city flux"]],
      dtype=object), array([[array([-3.2276144]),
        "watch trailer priyanka, keanu reeves' 'the matrix resurrections'"]],
      dtype=object), array([[array([-0.37307623]),
        "'no room for sexual harassers': journos ask wion end mj akbar's employment"]],
      dtype=object), array([[array([-0.37307623]),
        "indian squad for 2021 men's t20 wc: dhawan out, ashwin in & dhoni mentor team"]],
      dtype=object), array([[array([-0.09725957]),
      

  result = getattr(asarray(obj), method)(*args, **kwds)


In [11]:
print("The ranked list of articles is :")
for i in range(10):
  print(i+1,". ",end="")
  print(z[i][0][1])

The ranked list of articles is :
1 . captain vikram batra, the ‘shershaah’ pakistan army feared during kargil war
2 . only parents have right: saba ali khan kareena-saif naming their son jeh
3 . review: 'laabam' serves communism on a fast-food platter
4 . review: 'once upon time calcutta' tenderly captures city flux
5 . watch trailer priyanka, keanu reeves' 'the matrix resurrections'
6 . 'no room for sexual harassers': journos ask wion end mj akbar's employment
7 . indian squad for 2021 men's t20 wc: dhawan out, ashwin in & dhoni mentor team
8 . 'save us': scary scenes mystery fever grips uttar pradesh
9 . 'we are still amid second covid wave': health secretary
10 . carbon dating reveals 3,200-year-old civilisation tamil nadu: mk stalin
