In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd drive/MyDrive/Colab\ Notebooks/Minor\ Project\ Sem\ 6/

/content/drive/MyDrive/Colab Notebooks/Minor Project Sem 6


# Import packages

In [3]:
import pandas as pd
import numpy as np

from ast import literal_eval # to convert array string to array
from IPython.display import clear_output # to clear the large outputs

In [4]:
# !pip install tensorflow
# !pip install tensorflow-hub
!pip install rouge
clear_output()

In [5]:
from rouge import Rouge
import tensorflow as tf

# Load features csv and articles data

In [6]:
articles_df = pd.read_csv('duc2002finaldataset_0.csv')
articles_df = articles_df.iloc[:-1, :]
articles = articles_df.drop('Summary', axis=1)
articles_df.head()

Unnamed: 0,Article,Summary
0,"['On the day of the Big Event, Ladbroke, the l...","['Penelope Lively won the 1987 Booker Prize.',..."
1,"[""Australian novelist Peter Carey was awarded ...","[""The coveted Booker Prize for the year's best..."
2,"[""Six novels have been nominated for the Booke...","[""The winner of the 1989 Booker Prize, Britain..."
3,"[""Japanese writer Kazuo Ishiguro won the 1989 ...",['It was announced Thursday that Kazuo Ishigur...
4,"[""The Booker Prize is Britain's literary event...",['The Booker Prize has become internationally ...


# Apply linear regression model

read about rfe (Recursive feture elimitation for regression)

In [7]:
# load features
features_df = pd.read_csv('features/embeddings_using_word2vec_and_autoencoder.csv')

### Split data

In [8]:
from sklearn.model_selection import GroupShuffleSplit 

splitter = GroupShuffleSplit(test_size=.20, n_splits=2, random_state = 7)
split = splitter.split(features_df, groups=features_df.iloc[:, 0])
train_inds, test_inds = next(split)

train = features_df.iloc[train_inds]
test = features_df.iloc[test_inds]

X_train = train.drop(['Similarity with Summary'], axis=1)
y_train = train['Similarity with Summary']

X_test = test.drop(['Similarity with Summary'], axis=1)
y_test = test['Similarity with Summary']

In [9]:
# Now separete string data and numeric data
X_train_strings = X_train.iloc[:, :2]
X_train_data = X_train.iloc[:, 2:]

X_test_strings = X_test.iloc[:, :2]
X_test_data = X_test.iloc[:, 2:]

### Model creation - Simple linear regression

In [10]:
from sklearn.linear_model import LinearRegression

In [11]:
model = LinearRegression()

In [12]:
model.fit(X_train_data, y_train)

In [13]:
y_pred = model.predict(X_test_data)
# y_pred

## generate the predicted score for test data

Say for sentances with score > 40% are in summary

In [14]:
def predict_summary(y_pred):
  predicted_summary_sentance_nums = {} # article number: [sentence numbers]

  for i in range(len(X_test)):
    if y_pred[i] > 0.6:
      article_number = X_test.iloc[i, 0]
      sent_number = X_test.iloc[i, 1]
      if article_number not in predicted_summary_sentance_nums.keys():
        predicted_summary_sentance_nums[article_number] = []
      predicted_summary_sentance_nums[article_number].append(sent_number)
  

  # Get the sentaces for each articles
  pred_summary = {} # article number, summary
  for article_num in predicted_summary_sentance_nums.keys():
    article_num = int(article_num[1:])
    article = literal_eval(articles_df['Article'][article_num])
    pred_summary[article_num] = ""
    for sent in predicted_summary_sentance_nums["F"+str(article_num)]:
      sent_num = int(sent[1:])
      pred_summary[article_num] = pred_summary[article_num] + article[sent_num]

  return pred_summary

# Get the similarity score between predicted and actual summary using rouge 1,2,L

In [15]:
rouge = Rouge()

def similarity_using_rouge(sentance1, sentance2):
    rouge_scores = rouge.get_scores(sentance1, sentance2)[0]
    rouge_1 = rouge_scores['rouge-1']
    rouge_2 = rouge_scores['rouge-2']
    rouge_l = rouge_scores['rouge-l']
    return rouge_1, rouge_2, rouge_l
    return rouge_scores

In [16]:
def get_score_between_pred_and_act_summary(pred_summary):
  pred_actual_similarity = {}

  for article_num in pred_summary.keys():
    pred = pred_summary[article_num]

    actual_summary = literal_eval(articles_df['Summary'][article_num])
    actual_summary = "".join(actual_summary)

    pred_actual_similarity[article_num] = similarity_using_rouge(pred, actual_summary)

  return pred_actual_similarity

word2vec embedding model

fuzzy wuzzy 

autoencoder



# Creating different models

## Simple Regression prediction scores

In [17]:
predicted_summary = predict_summary(y_pred)
# predicted_summary

In [18]:
score_using_lr = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_lr

## Linear regression with rfe

In [19]:
from sklearn.feature_selection import RFE
rfe = RFE(model, n_features_to_select=10)
fit = rfe.fit(X_train_data, y_train)
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)

Num Features: 10
Selected Features: [ True False  True  True False False False False False False False False
 False  True  True False  True False False False False False  True False
 False False False False False  True False False False False False False
 False False False False False False  True False False  True False False
 False False]


In [20]:
# Transform
X_train_data_rfe = rfe.transform(X_train_data)
X_test_data_rfe = rfe.transform(X_test_data)

In [21]:
model = LinearRegression()
# model.fit(X_train_data_rfe, y_train)
model.fit(X_train_data, y_train)

In [22]:
# y_pred = model.predict(X_test_data_rfe)
y_pred = model.predict(X_test_data)
# y_pred

In [23]:
predicted_summary = predict_summary(y_pred)
score_using_lr_rfe = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_lr_rfe

In [24]:
score_using_lr

{5: ({'r': 0.5416666666666666, 'p': 0.3046875, 'f': 0.38999999539200003},
  {'r': 0.29213483146067415,
   'p': 0.14207650273224043,
   'f': 0.19117646618539152},
  {'r': 0.5, 'p': 0.28125, 'f': 0.35999999539200006}),
 7: ({'r': 0.3684210526315789,
   'p': 0.14736842105263157,
   'f': 0.21052631170784109},
  {'r': 0.05434782608695652,
   'p': 0.018656716417910446,
   'f': 0.027777773972840026},
  {'r': 0.2894736842105263,
   'p': 0.11578947368421053,
   'f': 0.1654135297529539}),
 11: ({'r': 0.6896551724137931,
   'p': 0.17045454545454544,
   'f': 0.2733485161841211},
  {'r': 0.41, 'p': 0.07130434782608695, 'f': 0.12148147895747605},
  {'r': 0.6551724137931034,
   'p': 0.16193181818181818,
   'f': 0.25968109021601177}),
 14: ({'r': 0.686046511627907,
   'p': 0.16526610644257703,
   'f': 0.2663656853587025},
  {'r': 0.4752475247524752, 'p': 0.0823327615780446, 'f': 0.14035087467584045},
  {'r': 0.6744186046511628,
   'p': 0.16246498599439776,
   'f': 0.2618510126724723}),
 19: ({'r': 0.6

## Different Models

In [25]:
# using svm
from sklearn.svm import SVR
model = SVR()
model.fit(X_train_data, y_train)
y_pred = model.predict(X_test_data)
predicted_summary = predict_summary(y_pred)
score_using_svm = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_svm

In [26]:
# using random forest
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_train_data, y_train)
y_pred = model.predict(X_test_data)
predicted_summary = predict_summary(y_pred)
score_using_rf = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_rf

In [27]:
# using decision tree
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()
model.fit(X_train_data, y_train)
y_pred = model.predict(X_test_data)
predicted_summary = predict_summary(y_pred)
score_using_dt = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_dt

In [28]:
# using xgboost
from xgboost import XGBRegressor
model = XGBRegressor()
model.fit(X_train_data, y_train)
y_pred = model.predict(X_test_data)
predicted_summary = predict_summary(y_pred)
score_using_xgb = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_xgb

In [29]:
# using gradient boosting
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor()
model.fit(X_train_data, y_train)
y_pred = model.predict(X_test_data)
predicted_summary = predict_summary(y_pred)
score_using_gb = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_gb

In [30]:
#  using ada boost
from sklearn.ensemble import AdaBoostRegressor
model = AdaBoostRegressor()
model.fit(X_train_data, y_train)
y_pred = model.predict(X_test_data)
predicted_summary = predict_summary(y_pred)
score_using_ab = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_ab

In [31]:
# using knn
from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor()
model.fit(X_train_data, y_train)
y_pred = model.predict(X_test_data)
predicted_summary = predict_summary(y_pred)
score_using_knn = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_knn

## Save the output scores in csv file

In [32]:

import inspect


def retrieve_name(var):
        for fi in reversed(inspect.stack()):
            names = [var_name for var_name, var_val in fi.frame.f_locals.items() if var_val is var]
            if len(names) > 0:
                return names[0]

In [39]:
# Save the avverage score of each model

import pandas as pd
df = pd.DataFrame(columns=['rouge-1 r', 'rouge-1 p', 'rouge-1 f', 'rouge-2 r', 'rouge-2 p', 'rouge-2 f', 'rouge-l r', 'rouge-l p', 'rouge-l f'])

# for each model
models = [score_using_lr, score_using_svm, score_using_rf, score_using_dt, score_using_xgb, score_using_knn, score_using_ab]
for i in range(len(models)):
    # Calculate the average value of [rouge-1][r] [rouge-1][p] [rouge-1][f] [rouge-2][r] [rouge-2][p] [rouge-2][f] [rouge-l][r] [rouge-l][p] [rouge-l][f]
    rouge_1_r, rouge_1_p, rouge_1_f = 0, 0, 0
    rouge_2_r, rouge_2_p, rouge_2_f = 0, 0, 0
    rouge_l_r, rouge_l_p, rouge_l_f = 0, 0, 0
    for article_num in models[i]:
        rouge_1_r += models[i][article_num][0]['r']
        rouge_1_p += models[i][article_num][0]['p']
        rouge_1_f += models[i][article_num][0]['f']
        rouge_2_r += models[i][article_num][1]['r']
        rouge_2_p += models[i][article_num][1]['p']
        rouge_2_f += models[i][article_num][1]['f']
        rouge_l_r += models[i][article_num][2]['r']
        rouge_l_p += models[i][article_num][2]['p']
        rouge_l_f += models[i][article_num][2]['f']
    
    rouge_1_r /= len(models[i])
    rouge_1_p /= len(models[i])
    rouge_1_f /= len(models[i])
    rouge_2_r /= len(models[i])
    rouge_2_p /= len(models[i])
    rouge_2_f /= len(models[i])
    rouge_l_r /= len(models[i])
    rouge_l_p /= len(models[i])
    rouge_l_f /= len(models[i])

    # Save the average value of [rouge-1][r] [rouge-1][p] [rouge-1][f] [rouge-2][r] [rouge-2][p] [rouge-2][f] [rouge-l][r] [rouge-l][p] [rouge-l][f]
    df.loc[retrieve_name(models[i]).split('_')[-1]] = [rouge_1_r, rouge_1_p, rouge_1_f, rouge_2_r, rouge_2_p, rouge_2_f, rouge_l_r, rouge_l_p, rouge_l_f]

In [40]:
df

Unnamed: 0,rouge-1 r,rouge-1 p,rouge-1 f,rouge-2 r,rouge-2 p,rouge-2 f,rouge-l r,rouge-l p,rouge-l f
lr,0.593056,0.257131,0.345326,0.311412,0.116952,0.162546,0.553041,0.239034,0.321371
svm,0.604165,0.249461,0.340105,0.315397,0.11144,0.157976,0.563365,0.232029,0.316716
rf,0.589574,0.256991,0.343652,0.309318,0.116157,0.160668,0.549976,0.239131,0.31994
dt,0.572478,0.253614,0.33904,0.291081,0.111582,0.154786,0.533799,0.235918,0.315686
xgb,0.604599,0.253593,0.345142,0.318677,0.1145,0.16206,0.564965,0.23641,0.322122
knn,0.569967,0.254388,0.338007,0.291245,0.113082,0.155244,0.531201,0.236664,0.314628
ab,0.540221,0.269517,0.345238,0.275181,0.120961,0.160313,0.499872,0.24884,0.319014


In [41]:
df.transpose()

Unnamed: 0,lr,svm,rf,dt,xgb,knn,ab
rouge-1 r,0.593056,0.604165,0.589574,0.572478,0.604599,0.569967,0.540221
rouge-1 p,0.257131,0.249461,0.256991,0.253614,0.253593,0.254388,0.269517
rouge-1 f,0.345326,0.340105,0.343652,0.33904,0.345142,0.338007,0.345238
rouge-2 r,0.311412,0.315397,0.309318,0.291081,0.318677,0.291245,0.275181
rouge-2 p,0.116952,0.11144,0.116157,0.111582,0.1145,0.113082,0.120961
rouge-2 f,0.162546,0.157976,0.160668,0.154786,0.16206,0.155244,0.160313
rouge-l r,0.553041,0.563365,0.549976,0.533799,0.564965,0.531201,0.499872
rouge-l p,0.239034,0.232029,0.239131,0.235918,0.23641,0.236664,0.24884
rouge-l f,0.321371,0.316716,0.31994,0.315686,0.322122,0.314628,0.319014


In [42]:
df.transpose().to_csv('output/average_rouge_score_autoencoder_and_word2vec.csv')