In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd drive/MyDrive/Colab\ Notebooks/Minor\ Project\ Sem\ 6/

/content/drive/MyDrive/Colab Notebooks/Minor Project Sem 6


# Import packages

In [3]:
import pandas as pd
import numpy as np

from ast import literal_eval # to convert array string to array
from IPython.display import clear_output # to clear the large outputs

In [4]:
# !pip install tensorflow
# !pip install tensorflow-hub
!pip install rouge
clear_output()

In [5]:
from rouge import Rouge
import tensorflow as tf

# Load features csv and articles data

In [6]:
articles_df = pd.read_csv('duc2002finaldataset_0.csv')
articles_df = articles_df.iloc[:-1, :]
articles = articles_df.drop('Summary', axis=1)
articles_df.head()

Unnamed: 0,Article,Summary
0,"['On the day of the Big Event, Ladbroke, the l...","['Penelope Lively won the 1987 Booker Prize.',..."
1,"[""Australian novelist Peter Carey was awarded ...","[""The coveted Booker Prize for the year's best..."
2,"[""Six novels have been nominated for the Booke...","[""The winner of the 1989 Booker Prize, Britain..."
3,"[""Japanese writer Kazuo Ishiguro won the 1989 ...",['It was announced Thursday that Kazuo Ishigur...
4,"[""The Booker Prize is Britain's literary event...",['The Booker Prize has become internationally ...


# Apply linear regression model

read about rfe (Recursive feture elimitation for regression)

In [7]:
# load features
features_df = pd.read_csv('features_with_summary_similarity.csv')

### Split data

In [8]:
from sklearn.model_selection import GroupShuffleSplit 

splitter = GroupShuffleSplit(test_size=.20, n_splits=2, random_state = 7)
split = splitter.split(features_df, groups=features_df.iloc[:, 0])
train_inds, test_inds = next(split)

train = features_df.iloc[train_inds]
test = features_df.iloc[test_inds]

X_train = train.drop(['Similarity with Summary'], axis=1)
y_train = train['Similarity with Summary']

X_test = test.drop(['Similarity with Summary'], axis=1)
y_test = test['Similarity with Summary']

In [9]:
# Now separete string data and numeric data
X_train_strings = X_train.iloc[:, :2]
X_train_data = X_train.iloc[:, 2:]

X_test_strings = X_test.iloc[:, :2]
X_test_data = X_test.iloc[:, 2:]

## Function to generate the predicted score for test data

Say for sentances with score > 40% are in summary

In [10]:
def predict_summary(y_pred):
  predicted_summary_sentance_nums = {} # article number: [sentence numbers]

  for i in range(len(X_test)):
    if y_pred[i] > 0.6:
      article_number = X_test.iloc[i, 0]
      sent_number = X_test.iloc[i, 1]
      if article_number not in predicted_summary_sentance_nums.keys():
        predicted_summary_sentance_nums[article_number] = []
      predicted_summary_sentance_nums[article_number].append(sent_number)
  

  # Get the sentaces for each articles
  pred_summary = {} # article number, summary
  for article_num in predicted_summary_sentance_nums.keys():
    article_num = int(article_num[1:])
    article = literal_eval(articles_df['Article'][article_num])
    pred_summary[article_num] = ""
    for sent in predicted_summary_sentance_nums["F"+str(article_num)]:
      sent_num = int(sent[1:])
      pred_summary[article_num] = pred_summary[article_num] + article[sent_num]

  return pred_summary

# Get the similarity score between predicted and actual summary using rouge 1,2,L

In [11]:
rouge = Rouge()

def similarity_using_rouge(sentance1, sentance2):
    rouge_scores = rouge.get_scores(sentance1, sentance2)[0]
    rouge_1 = rouge_scores['rouge-1']
    rouge_2 = rouge_scores['rouge-2']
    rouge_l = rouge_scores['rouge-l']
    return rouge_1, rouge_2, rouge_l
    return rouge_scores

In [12]:
def get_score_between_pred_and_act_summary(pred_summary):
  pred_actual_similarity = {}

  for article_num in pred_summary.keys():
    pred = pred_summary[article_num]

    actual_summary = literal_eval(articles_df['Summary'][article_num])
    actual_summary = "".join(actual_summary)

    pred_actual_similarity[article_num] = similarity_using_rouge(pred, actual_summary)

  return pred_actual_similarity

# Creating different models

## Linear regression with rfe

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

model = LinearRegression()
rfe = RFE(model, n_features_to_select=6)
fit = rfe.fit(X_train_data, y_train)
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)

Num Features: 6
Selected Features: [ True False  True  True False False False False  True  True  True False
 False]


In [14]:
# Transform
X_train_data_rfe = rfe.transform(X_train_data)
X_test_data_rfe = rfe.transform(X_test_data)

In [15]:
model.fit(X_train_data_rfe, y_train)

In [16]:
y_pred = model.predict(X_test_data_rfe)
# y_pred

In [17]:
predicted_summary = predict_summary(y_pred)
score_using_lr = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_lr_rfe

In [18]:
score_using_lr

{5: ({'r': 0.6805555555555556,
   'p': 0.25925925925925924,
   'f': 0.37547892320782134},
  {'r': 0.3595505617977528, 'p': 0.1198501872659176, 'f': 0.17977527714887648},
  {'r': 0.6527777777777778,
   'p': 0.24867724867724866,
   'f': 0.3601532527097371}),
 7: ({'r': 0.2631578947368421,
   'p': 0.12269938650306748,
   'f': 0.16736401239894272},
  {'r': 0.05434782608695652,
   'p': 0.02336448598130841,
   'f': 0.03267973435687183},
  {'r': 0.2236842105263158,
   'p': 0.10429447852760736,
   'f': 0.14225940988848249}),
 11: ({'r': 0.5517241379310345,
   'p': 0.16842105263157894,
   'f': 0.25806451254552554},
  {'r': 0.32, 'p': 0.07048458149779736, 'f': 0.115523462745507},
  {'r': 0.5287356321839081,
   'p': 0.16140350877192983,
   'f': 0.2473118243734825}),
 14: ({'r': 0.6976744186046512,
   'p': 0.1892744479495268,
   'f': 0.2977667460224495},
  {'r': 0.48514851485148514, 'p': 0.1016597510373444, 'f': 0.1680960520239255},
  {'r': 0.6744186046511628,
   'p': 0.1829652996845426,
   'f': 0

## Different Models

In [19]:
# using svm
from sklearn.svm import SVR
model = SVR()
model.fit(X_train_data_rfe, y_train)
y_pred = model.predict(X_test_data_rfe)
predicted_summary = predict_summary(y_pred)
score_using_svm = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_svm

In [20]:
# using random forest
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_train_data_rfe, y_train)
y_pred = model.predict(X_test_data_rfe)
predicted_summary = predict_summary(y_pred)
score_using_rf = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_rf

In [21]:
# using decision tree
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()
model.fit(X_train_data_rfe, y_train)
y_pred = model.predict(X_test_data_rfe)
predicted_summary = predict_summary(y_pred)
score_using_dt = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_dt

In [38]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train_data, y_train)
y_pred = model.predict(X_test_data)
predicted_summary = predict_summary(y_pred)
score_using_nb = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_nb 

ValueError: ignored

In [22]:
# using xgboost
from xgboost import XGBRegressor
model = XGBRegressor()
model.fit(X_train_data_rfe, y_train)
y_pred = model.predict(X_test_data_rfe)
predicted_summary = predict_summary(y_pred)
score_using_xgb = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_xgb

In [23]:
# using gradient boosting
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor()
model.fit(X_train_data_rfe, y_train)
y_pred = model.predict(X_test_data_rfe)
predicted_summary = predict_summary(y_pred)
score_using_gb = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_gb

In [24]:
#  using ada boost
from sklearn.ensemble import AdaBoostRegressor
model = AdaBoostRegressor()
model.fit(X_train_data_rfe, y_train)
y_pred = model.predict(X_test_data_rfe)
predicted_summary = predict_summary(y_pred)
score_using_ab = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_ab

In [25]:
# using knn
from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor()
model.fit(X_train_data_rfe, y_train)
y_pred = model.predict(X_test_data_rfe)
predicted_summary = predict_summary(y_pred)
score_using_knn = get_score_between_pred_and_act_summary(predicted_summary)
# score_using_knn

## Save the output scores in csv file

In [26]:
import inspect


def retrieve_name(var):
        for fi in reversed(inspect.stack()):
            names = [var_name for var_name, var_val in fi.frame.f_locals.items() if var_val is var]
            if len(names) > 0:
                return names[0]

In [34]:
# Save the avverage score of each model

import pandas as pd
df = pd.DataFrame(columns=['rouge-1 r', 'rouge-1 p', 'rouge-1 f', 'rouge-2 r', 'rouge-2 p', 'rouge-2 f', 'rouge-l r', 'rouge-l p', 'rouge-l f'])

# for each model
models = [score_using_lr, score_using_svm, score_using_rf, score_using_dt, score_using_xgb, score_using_knn, score_using_ab]
for i in range(len(models)):
    # Calculate the average value of [rouge-1][r] [rouge-1][p] [rouge-1][f] [rouge-2][r] [rouge-2][p] [rouge-2][f] [rouge-l][r] [rouge-l][p] [rouge-l][f]
    rouge_1_r, rouge_1_p, rouge_1_f = 0, 0, 0
    rouge_2_r, rouge_2_p, rouge_2_f = 0, 0, 0
    rouge_l_r, rouge_l_p, rouge_l_f = 0, 0, 0
    for article_num in models[i]:
        rouge_1_r += models[i][article_num][0]['r']
        rouge_1_p += models[i][article_num][0]['p']
        rouge_1_f += models[i][article_num][0]['f']
        rouge_2_r += models[i][article_num][1]['r']
        rouge_2_p += models[i][article_num][1]['p']
        rouge_2_f += models[i][article_num][1]['f']
        rouge_l_r += models[i][article_num][2]['r']
        rouge_l_p += models[i][article_num][2]['p']
        rouge_l_f += models[i][article_num][2]['f']
    
    rouge_1_r /= len(models[i])
    rouge_1_p /= len(models[i])
    rouge_1_f /= len(models[i])
    rouge_2_r /= len(models[i])
    rouge_2_p /= len(models[i])
    rouge_2_f /= len(models[i])
    rouge_l_r /= len(models[i])
    rouge_l_p /= len(models[i])
    rouge_l_f /= len(models[i])

    # Save the average value of [rouge-1][r] [rouge-1][p] [rouge-1][f] [rouge-2][r] [rouge-2][p] [rouge-2][f] [rouge-l][r] [rouge-l][p] [rouge-l][f]
    print(retrieve_name(models[i]))
    df.loc[retrieve_name(models[i]).split('_')[-1]] = [rouge_1_r, rouge_1_p, rouge_1_f, rouge_2_r, rouge_2_p, rouge_2_f, rouge_l_r, rouge_l_p, rouge_l_f]

score_using_lr
score_using_svm
score_using_rf
score_using_dt
score_using_xgb
score_using_knn
score_using_ab


In [35]:
df

Unnamed: 0,rouge-1 r,rouge-1 p,rouge-1 f,rouge-2 r,rouge-2 p,rouge-2 f,rouge-l r,rouge-l p,rouge-l f
lr,0.593138,0.252677,0.346261,0.315291,0.116032,0.165095,0.55182,0.235354,0.322338
svm,0.63443,0.243514,0.344726,0.343667,0.112138,0.165025,0.592572,0.227823,0.322326
rf,0.629381,0.251606,0.351215,0.34286,0.116887,0.16983,0.587878,0.23485,0.327913
dt,0.597523,0.251941,0.345981,0.315306,0.114769,0.163559,0.556215,0.234729,0.322244
xgb,0.626301,0.245528,0.345192,0.336438,0.112694,0.164649,0.585281,0.229884,0.32297
knn,0.623811,0.248753,0.348434,0.340021,0.115749,0.168492,0.582328,0.23236,0.325374
ab,0.58179,0.257106,0.347183,0.30508,0.115867,0.163099,0.540089,0.238855,0.322465


In [36]:
df.transpose()

Unnamed: 0,lr,svm,rf,dt,xgb,knn,ab
rouge-1 r,0.593138,0.63443,0.629381,0.597523,0.626301,0.623811,0.58179
rouge-1 p,0.252677,0.243514,0.251606,0.251941,0.245528,0.248753,0.257106
rouge-1 f,0.346261,0.344726,0.351215,0.345981,0.345192,0.348434,0.347183
rouge-2 r,0.315291,0.343667,0.34286,0.315306,0.336438,0.340021,0.30508
rouge-2 p,0.116032,0.112138,0.116887,0.114769,0.112694,0.115749,0.115867
rouge-2 f,0.165095,0.165025,0.16983,0.163559,0.164649,0.168492,0.163099
rouge-l r,0.55182,0.592572,0.587878,0.556215,0.585281,0.582328,0.540089
rouge-l p,0.235354,0.227823,0.23485,0.234729,0.229884,0.23236,0.238855
rouge-l f,0.322338,0.322326,0.327913,0.322244,0.32297,0.325374,0.322465


In [37]:
df.transpose().to_csv('output/average_rouge_score_with_rfe.csv')