In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd drive/MyDrive/Colab\ Notebooks/Minor\ Project\ Sem\ 6/

/content/drive/MyDrive/Colab Notebooks/Minor Project Sem 6


# Import packages

In [3]:
import pandas as pd
import numpy as np

from ast import literal_eval # to convert array string to array
from IPython.display import clear_output # to clear the large outputs

In [4]:
# !pip install tensorflow
# !pip install tensorflow-hub
!pip install rouge
clear_output()

In [5]:
from rouge import Rouge
import tensorflow as tf

# Load features csv and articles data

In [6]:
articles_df = pd.read_csv('duc2002finaldataset_0.csv')
articles_df = articles_df.iloc[:-1, :]
articles = articles_df.drop('Summary', axis=1)
articles_df.head()

Unnamed: 0,Article,Summary
0,"['On the day of the Big Event, Ladbroke, the l...","['Penelope Lively won the 1987 Booker Prize.',..."
1,"[""Australian novelist Peter Carey was awarded ...","[""The coveted Booker Prize for the year's best..."
2,"[""Six novels have been nominated for the Booke...","[""The winner of the 1989 Booker Prize, Britain..."
3,"[""Japanese writer Kazuo Ishiguro won the 1989 ...",['It was announced Thursday that Kazuo Ishigur...
4,"[""The Booker Prize is Britain's literary event...",['The Booker Prize has become internationally ...


# Apply linear regression model

read about rfe (Recursive feture elimitation for regression)

In [9]:
# load features
features_df = pd.read_csv('features/features_with_summary_similarity.csv')

### Split data

In [10]:
from sklearn.model_selection import GroupShuffleSplit 

splitter = GroupShuffleSplit(test_size=.20, n_splits=2, random_state = 7)
split = splitter.split(features_df, groups=features_df.iloc[:, 0])
train_inds, test_inds = next(split)

train = features_df.iloc[train_inds]
test = features_df.iloc[test_inds]

X_train = train.drop(['Similarity with Summary'], axis=1)
y_train = train['Similarity with Summary']

X_test = test.drop(['Similarity with Summary'], axis=1)
y_test = test['Similarity with Summary']

In [11]:
# Now separete string data and numeric data
X_train_strings = X_train.iloc[:, :2]
X_train_data = X_train.iloc[:, 2:]

X_test_strings = X_test.iloc[:, :2]
X_test_data = X_test.iloc[:, 2:]

### Model creation - Simple linear regression

In [12]:
from sklearn.linear_model import LinearRegression

In [13]:
model = LinearRegression()

In [14]:
model.fit(X_train_data, y_train)

In [15]:
y_pred = model.predict(X_test_data)
# y_pred

## generate the predicted score for test data

Say for sentances with score > 40% are in summary

In [16]:
def predict_summary(y_pred):
  predicted_summary_sentance_nums = {} # article number: [sentence numbers]

  for i in range(len(X_test)):
    if y_pred[i] > 0.6:
      article_number = X_test.iloc[i, 0]
      sent_number = X_test.iloc[i, 1]
      if article_number not in predicted_summary_sentance_nums.keys():
        predicted_summary_sentance_nums[article_number] = []
      predicted_summary_sentance_nums[article_number].append(sent_number)
  

  # Get the sentaces for each articles
  pred_summary = {} # article number, summary
  for article_num in predicted_summary_sentance_nums.keys():
    article_num = int(article_num[1:])
    article = literal_eval(articles_df['Article'][article_num])
    pred_summary[article_num] = ""
    for sent in predicted_summary_sentance_nums["F"+str(article_num)]:
      sent_num = int(sent[1:])
      pred_summary[article_num] = pred_summary[article_num] + article[sent_num]

  return pred_summary

# Get the similarity score between predicted and actual summary using rouge 1,2,L

In [17]:
rouge = Rouge()

def similarity_using_rouge(sentance1, sentance2):
    rouge_scores = rouge.get_scores(sentance1, sentance2)[0]
    rouge_1 = rouge_scores['rouge-1']
    rouge_2 = rouge_scores['rouge-2']
    rouge_l = rouge_scores['rouge-l']
    return rouge_1, rouge_2, rouge_l
    return rouge_scores

In [18]:
def get_score_between_pred_and_act_summary(pred_summary):
  pred_actual_similarity = {}

  for article_num in pred_summary.keys():
    pred = pred_summary[article_num]

    actual_summary = literal_eval(articles_df['Summary'][article_num])
    actual_summary = "".join(actual_summary)

    pred_actual_similarity[article_num] = similarity_using_rouge(pred, actual_summary)

  return pred_actual_similarity

word2vec embedding model

fuzzy wuzzy 

autoencoder



# Creating different models

## Simple Regression prediction scores

In [19]:
predicted_summary = predict_summary(y_pred)
# predicted_summary

In [20]:
score_using_lr = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_lr

## Linear regression with rfe

In [21]:
from sklearn.feature_selection import RFE
rfe = RFE(model, n_features_to_select=10)
fit = rfe.fit(X_train_data, y_train)
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)

Num Features: 10
Selected Features: [ True  True  True  True  True False  True False  True  True  True False
  True]


In [22]:
# Transform
X_train_data_rfe = rfe.transform(X_train_data)
X_test_data_rfe = rfe.transform(X_test_data)

In [23]:
model = LinearRegression()
# model.fit(X_train_data_rfe, y_train)
model.fit(X_train_data, y_train)

In [24]:
# y_pred = model.predict(X_test_data_rfe)
y_pred = model.predict(X_test_data)
# y_pred

In [25]:
predicted_summary = predict_summary(y_pred)
score_using_lr_rfe = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_lr_rfe

In [26]:
score_using_lr

{5: ({'r': 0.6805555555555556,
   'p': 0.2413793103448276,
   'f': 0.356363632498248},
  {'r': 0.3595505617977528,
   'p': 0.10884353741496598,
   'f': 0.16710182410869262},
  {'r': 0.6527777777777778, 'p': 0.2315270935960591, 'f': 0.3418181779527934}),
 7: ({'r': 0.3815789473684211,
   'p': 0.14646464646464646,
   'f': 0.21167882810805055},
  {'r': 0.05434782608695652,
   'p': 0.018115942028985508,
   'f': 0.02717390929347878},
  {'r': 0.3026315789473684,
   'p': 0.11616161616161616,
   'f': 0.1678832076700944}),
 11: ({'r': 0.5517241379310345,
   'p': 0.16842105263157894,
   'f': 0.25806451254552554},
  {'r': 0.32, 'p': 0.07048458149779736, 'f': 0.115523462745507},
  {'r': 0.5287356321839081,
   'p': 0.16140350877192983,
   'f': 0.2473118243734825}),
 14: ({'r': 0.6744186046511628,
   'p': 0.2230769230769231,
   'f': 0.33526011187142907},
  {'r': 0.4752475247524752, 'p': 0.125, 'f': 0.19793814103228827},
  {'r': 0.6627906976744186,
   'p': 0.21923076923076923,
   'f': 0.3294797650506

## Different Models

In [27]:
# using svm
from sklearn.svm import SVR
model = SVR()
model.fit(X_train_data, y_train)
y_pred = model.predict(X_test_data)
predicted_summary = predict_summary(y_pred)
score_using_svm = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_svm

In [28]:
# using random forest
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_train_data, y_train)
y_pred = model.predict(X_test_data)
predicted_summary = predict_summary(y_pred)
score_using_rf = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_rf

In [29]:
# using decision tree
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()
model.fit(X_train_data, y_train)
y_pred = model.predict(X_test_data)
predicted_summary = predict_summary(y_pred)
score_using_dt = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_dt

In [30]:
# using xgboost
from xgboost import XGBRegressor
model = XGBRegressor()
model.fit(X_train_data, y_train)
y_pred = model.predict(X_test_data)
predicted_summary = predict_summary(y_pred)
score_using_xgb = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_xgb
predicted_summary

{5: "Critic and novelist A.S. Byatt on Tuesday won the Booker Prize, Britain's most prestigious literary award, for her tale of two young scholars investigating the lives of a pair of imaginary Victorian poets.The five judges deliberated two hours before awarding the $39,000 prize to Antonia Byatt for ``Possession,'' one of six finalists in the 21-year-old competition.Television executive Sir Denis Forman, chairman of the judge's panel, announced the prize at a banquet in London's Elizabethan Guildhall, seat of the city's Lord Mayor.``There was very strong individual support for several books on the short list and finally `Possession' by A.S. Byatt was the winner by a majority vote,'' said the judges.Ms. Byatt, who earlier this month won the $43,000 Irish Times-Aer Lingus prize for international fiction for the same work, has published five novels since 1964.In ``Possession,'' she tells how two graduate students piece together the relationship and lives of two imaginary Victorian poets

In [31]:
# using gradient boosting
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor()
model.fit(X_train_data, y_train)
y_pred = model.predict(X_test_data)
predicted_summary = predict_summary(y_pred)
score_using_gb = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_gb

In [32]:
#  using ada boost
from sklearn.ensemble import AdaBoostRegressor
model = AdaBoostRegressor()
model.fit(X_train_data, y_train)
y_pred = model.predict(X_test_data)
predicted_summary = predict_summary(y_pred)
score_using_ab = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_ab

In [33]:
# using knn
from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor()
model.fit(X_train_data, y_train)
y_pred = model.predict(X_test_data)
predicted_summary = predict_summary(y_pred)
score_using_knn = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_knn

## Save the output scores in csv file

In [34]:
import inspect


def retrieve_name(var):
        for fi in reversed(inspect.stack()):
            names = [var_name for var_name, var_val in fi.frame.f_locals.items() if var_val is var]
            if len(names) > 0:
                return names[0]

In [35]:
# Save the avverage score of each model

import pandas as pd
df = pd.DataFrame(columns=['rouge-1 r', 'rouge-1 p', 'rouge-1 f', 'rouge-2 r', 'rouge-2 p', 'rouge-2 f', 'rouge-l r', 'rouge-l p', 'rouge-l f'])

# for each model
models = [score_using_lr, score_using_svm, score_using_rf, score_using_dt, score_using_xgb, score_using_knn, score_using_ab]
for i in range(len(models)):
    # Calculate the average value of [rouge-1][r] [rouge-1][p] [rouge-1][f] [rouge-2][r] [rouge-2][p] [rouge-2][f] [rouge-l][r] [rouge-l][p] [rouge-l][f]
    rouge_1_r, rouge_1_p, rouge_1_f = 0, 0, 0
    rouge_2_r, rouge_2_p, rouge_2_f = 0, 0, 0
    rouge_l_r, rouge_l_p, rouge_l_f = 0, 0, 0
    for article_num in models[i]:
        rouge_1_r += models[i][article_num][0]['r']
        rouge_1_p += models[i][article_num][0]['p']
        rouge_1_f += models[i][article_num][0]['f']
        rouge_2_r += models[i][article_num][1]['r']
        rouge_2_p += models[i][article_num][1]['p']
        rouge_2_f += models[i][article_num][1]['f']
        rouge_l_r += models[i][article_num][2]['r']
        rouge_l_p += models[i][article_num][2]['p']
        rouge_l_f += models[i][article_num][2]['f']
    
    rouge_1_r /= len(models[i])
    rouge_1_p /= len(models[i])
    rouge_1_f /= len(models[i])
    rouge_2_r /= len(models[i])
    rouge_2_p /= len(models[i])
    rouge_2_f /= len(models[i])
    rouge_l_r /= len(models[i])
    rouge_l_p /= len(models[i])
    rouge_l_f /= len(models[i])

    # Save the average value of [rouge-1][r] [rouge-1][p] [rouge-1][f] [rouge-2][r] [rouge-2][p] [rouge-2][f] [rouge-l][r] [rouge-l][p] [rouge-l][f]
    df.loc[retrieve_name(models[i]).split('_')[-1]] = [rouge_1_r, rouge_1_p, rouge_1_f, rouge_2_r, rouge_2_p, rouge_2_f, rouge_l_r, rouge_l_p, rouge_l_f]

In [36]:
df

Unnamed: 0,rouge-1 r,rouge-1 p,rouge-1 f,rouge-2 r,rouge-2 p,rouge-2 f,rouge-l r,rouge-l p,rouge-l f
,0.601516,0.258061,0.352336,0.321807,0.119099,0.168502,0.560685,0.240835,0.328606
svm,0.614289,0.254674,0.351421,0.33392,0.118344,0.169503,0.573006,0.237956,0.328077
rf,0.624281,0.252407,0.349485,0.339895,0.116264,0.167736,0.583554,0.235807,0.326596
dt,0.593349,0.253727,0.345554,0.316222,0.117333,0.165629,0.555177,0.237491,0.323421
xgb,0.630758,0.247735,0.347968,0.345976,0.11514,0.168412,0.59087,0.232083,0.325957
knn,0.627127,0.251652,0.351169,0.343194,0.118183,0.171339,0.587361,0.23616,0.329324
ab,0.538453,0.274265,0.351615,0.276918,0.122134,0.163978,0.495681,0.252645,0.323818


In [37]:
df.transpose()

Unnamed: 0,Unnamed: 1,svm,rf,dt,xgb,knn,ab
rouge-1 r,0.601516,0.614289,0.624281,0.593349,0.630758,0.627127,0.538453
rouge-1 p,0.258061,0.254674,0.252407,0.253727,0.247735,0.251652,0.274265
rouge-1 f,0.352336,0.351421,0.349485,0.345554,0.347968,0.351169,0.351615
rouge-2 r,0.321807,0.33392,0.339895,0.316222,0.345976,0.343194,0.276918
rouge-2 p,0.119099,0.118344,0.116264,0.117333,0.11514,0.118183,0.122134
rouge-2 f,0.168502,0.169503,0.167736,0.165629,0.168412,0.171339,0.163978
rouge-l r,0.560685,0.573006,0.583554,0.555177,0.59087,0.587361,0.495681
rouge-l p,0.240835,0.237956,0.235807,0.237491,0.232083,0.23616,0.252645
rouge-l f,0.328606,0.328077,0.326596,0.323421,0.325957,0.329324,0.323818


In [38]:
df.transpose().to_csv('output/average_rouge_score.csv')