<a href="https://colab.research.google.com/github/imyanzhen/neon/blob/master/5c0dc9f0_1072_466c_b702_a7b756426cc7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import time
import nltk
import warnings
import numpy as np
import pandas as pd

from scipy.spatial.distance import cosine
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the Punkt models that nltk.sent_tokenize needs (no-op when already present).
nltk.download('punkt')
# NOTE(review): this silences *every* warning for the rest of the session,
# including pandas/scikit-learn deprecation and SettingWithCopy warnings.
warnings.filterwarnings("ignore")
# Colab-only extension: renders DataFrames as interactive tables.
%load_ext google.colab.data_table

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The google.colab.data_table extension is already loaded. To reload it, use:
  %reload_ext google.colab.data_table


In [0]:
# Read the first corpus from Google Drive. The file is read without a header
# row and its column is labelled 'text'.
df = pd.read_csv(
    '/content/drive/My Drive/Chen/output1.csv',
    names=['text'],
    header=None,
)
df.head()

Unnamed: 0,text
0,Madame Secretary:\n\nThank you for reaching ou...
1,"Cheryl, Jake,\n\nI received a call from Masood..."
2,We anticipate the release of what are claimed ...
3,Spoke to Ed Levine today to follow up on Frida...
4,Purely to update: Tom had me in for lunch at t...


In [0]:
# Sanity check: (rows, columns) of the loaded corpus.
df.shape

(4800, 1)

In [0]:
def tokenize(x):
  """Split the text `x` into sentences with NLTK's sentence tokenizer.

  Returns whatever `nltk.sent_tokenize` returns for `x`.
  """
  sentences = nltk.sent_tokenize(x)
  return sentences

def sentenize(temp, col = 'text', reset_index=False):
  """Expand a list-valued column so that every list element gets its own row.

  Parameters
  ----------
  temp : pandas.DataFrame
      Frame whose column `col` holds lists (e.g. lists of sentences).
      It is not modified.
  col : str, default 'text'
      Name of the list-valued column to expand.
  reset_index : bool, default False
      If False, every produced row keeps the index label of the row it came
      from (so labels repeat). If True, that label is moved into a regular
      column and the result gets a fresh 0..n-1 index.

  Returns
  -------
  pandas.DataFrame
      All other columns in their original order, followed by `col`, with
      one row per list element.
  """
  # DataFrame.explode does the expansion directly. The previous
  # apply(pd.Series).stack() approach first built a dense frame as wide as
  # the longest list (NaN-padded) and depended on stack() dropping the padding.
  exploded = temp.explode(col)

  # Keep the historical column layout: the expanded column always comes last.
  other_cols = [name for name in exploded.columns if name != col]
  exploded = exploded[other_cols + [col]]

  if reset_index:
    exploded = exploded.reset_index()
  return exploded

def lsa_process(df, n_components=5, random_state=None):
  """Run the sentence-level LSA pipeline on a frame of paragraphs.

  Steps: lower-case each paragraph, split it into sentences (`tokenize`),
  split every sentence again on ':', replace non-alphanumeric characters by
  spaces, keep fragments with more than three whitespace-separated tokens,
  build a TF-IDF matrix, reduce it with TruncatedSVD, min-max scale and
  L2-normalise the result, then compute the cosine *distance*
  (`scipy.spatial.distance.cosine`, i.e. 1 - cosine similarity) between each
  sentence and the next sentence of the same paragraph.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a string column named 'text', one paragraph per row.
      The row index is used as the paragraph id. `df` is not modified.
  n_components : int, default 5
      Number of LSA dimensions kept by TruncatedSVD.
  random_state : int or None, default None
      Seed forwarded to TruncatedSVD; pass an int for reproducible output.

  Returns
  -------
  (outdf1, outdf2, outdf3) : tuple of pandas.DataFrame
      outdf1 - one row per paragraph: paragraph_id, sentence_begin,
               sentence_end, sentence_count and mean/median/max/min/std of
               the within-paragraph cosine distances.
      outdf2 - one row per sentence: sentence_id, sentence and
               dimension1..dimensionN (the LSA coordinates).
      outdf3 - one row per adjacent sentence pair of a paragraph:
               sentence_id1, sentence_id2, cosine.
      Sentence ids are positions within this call's output (they start at 0
      on every call).
  """
  # Private copy of the only column the pipeline uses.
  temp = df[['text']].copy()
  temp['text'] = temp['text'].str.lower()

  # Paragraph -> sentences; the original row label becomes para_id.
  temp['text'] = temp['text'].apply(tokenize)
  temp = sentenize(temp,'text',True)
  temp.columns = ['para_id','text']

  # Sentence -> fragments separated by ':'. The index is reset because the
  # expansion repeats index labels.
  temp['text'] = temp['text'].str.split(':')
  temp = sentenize(temp,'text').reset_index(drop=True)

  # regex=True is required: since pandas 2.0 str.replace treats the pattern
  # as a literal string by default, which would leave punctuation untouched.
  temp['text'] = temp['text'].str.replace("[^a-zA-Z0-9]", " ", regex=True)

  # Drop missing fragments and those with three tokens or fewer.
  temp = temp.dropna(subset=['text'])
  temp = temp[temp['text'].str.split().str.len().gt(3)]
  temp = temp.reset_index(drop=True)

  vectorizer = TfidfVectorizer(max_df=0.9, max_features=10000,
                               min_df=2, stop_words='english',
                               use_idf=True)
  dtm = vectorizer.fit_transform(temp['text'])

  lsa = TruncatedSVD(n_components, random_state=random_state)
  lsa_pipeline = make_pipeline(lsa, MinMaxScaler(copy=False),
                               Normalizer(copy=False),verbose=True)
  dtm_lsa = lsa_pipeline.fit_transform(dtm)

  lsa_df = pd.DataFrame(dtm_lsa)
  lsa_df.columns = ['dim' + str(col) for col in lsa_df.columns]
  dim_cols = list(lsa_df.columns)

  # Cosine distance between each sentence and the following one. Computed on
  # the raw array and assigned in one go; the last row has no successor.
  vectors = lsa_df.values
  distances = np.full(len(temp), np.nan)
  distances[:-1] = [cosine(vectors[i], vectors[i + 1])
                    for i in range(len(vectors) - 1)]
  temp['cosine'] = distances

  temp['sent_id'] = temp.groupby(['para_id']).cumcount()+1
  # The last sentence of a paragraph would otherwise be compared with the
  # first sentence of the next paragraph, so blank that value.
  is_last_in_para = ~temp['para_id'].duplicated(keep='last')
  temp.loc[is_last_in_para, 'cosine'] = np.nan
  temp = temp[['para_id','sent_id','text','cosine']]

  final_df = temp.join(lsa_df).rename_axis('sent_gid').reset_index()

  # Deliverable 1: per-paragraph summary (named aggregation yields flat
  # column names directly, in the output order).
  outdf1 = (final_df.groupby('para_id')
            .agg(sentence_begin=('sent_gid', 'first'),
                 sentence_end=('sent_gid', 'last'),
                 sentence_count=('sent_gid', 'count'),
                 cosine_mean=('cosine', 'mean'),
                 cosine_median=('cosine', 'median'),
                 cosine_max=('cosine', 'max'),
                 cosine_min=('cosine', 'min'),
                 cosine_std=('cosine', 'std'))
            .reset_index()
            .rename(columns={'para_id': 'paragraph_id'}))

  # Deliverable 2: per-sentence LSA coordinates. Column labels are a flat
  # list (a nested list here would create a MultiIndex header).
  outdf2 = final_df[['sent_gid','text'] + dim_cols].copy()
  outdf2.columns = (['sentence_id','sentence'] +
                    ['dimension' + str(i + 1) for i in range(len(dim_cols))])

  # Deliverable 3: adjacent sentence pairs with their cosine distance. Built
  # as a new frame rather than by mutating a slice of final_df.
  outdf3 = (final_df[['sent_gid','cosine']]
            .rename(columns={'sent_gid': 'sentence_id1'})
            .assign(sentence_id2=lambda d: d['sentence_id1'] + 1)
            .dropna()
            [['sentence_id1','sentence_id2','cosine']])

  return outdf1,outdf2,outdf3

In [0]:
# Part 1 of 4: run the pipeline on the first 1200 rows and write the three
# result tables to the working directory.
out1, out2, out3 = lsa_process(df[:1200])
for _frame, _path in zip((out1, out2, out3),
                         ('out1_part1_delv1.csv',
                          'out1_part1_delv2.csv',
                          'out1_part1_delv3.csv')):
  _frame.to_csv(_path)

[Pipeline] ...... (step 1 of 3) Processing truncatedsvd, total=   0.1s
[Pipeline] ...... (step 2 of 3) Processing minmaxscaler, total=   0.0s
[Pipeline] ........ (step 3 of 3) Processing normalizer, total=   0.0s


In [0]:
# Part 2 of 4: rows 1200-2399, written as the three part-2 result tables.
out1, out2, out3 = lsa_process(df[1200:2400])
for _frame, _path in zip((out1, out2, out3),
                         ('out1_part2_delv1.csv',
                          'out1_part2_delv2.csv',
                          'out1_part2_delv3.csv')):
  _frame.to_csv(_path)

[Pipeline] ...... (step 1 of 3) Processing truncatedsvd, total=   0.1s
[Pipeline] ...... (step 2 of 3) Processing minmaxscaler, total=   0.0s
[Pipeline] ........ (step 3 of 3) Processing normalizer, total=   0.0s


In [0]:
# Part 3 of 4: rows 2400-3599, written as the three part-3 result tables.
out1, out2, out3 = lsa_process(df[2400:3600])
for _frame, _path in zip((out1, out2, out3),
                         ('out1_part3_delv1.csv',
                          'out1_part3_delv2.csv',
                          'out1_part3_delv3.csv')):
  _frame.to_csv(_path)

[Pipeline] ...... (step 1 of 3) Processing truncatedsvd, total=   0.1s
[Pipeline] ...... (step 2 of 3) Processing minmaxscaler, total=   0.0s
[Pipeline] ........ (step 3 of 3) Processing normalizer, total=   0.0s


In [0]:
# Part 4 of 4: rows 3600 to the end, written as the three part-4 result tables.
out1, out2, out3 = lsa_process(df[3600:])
for _frame, _path in zip((out1, out2, out3),
                         ('out1_part4_delv1.csv',
                          'out1_part4_delv2.csv',
                          'out1_part4_delv3.csv')):
  _frame.to_csv(_path)

[Pipeline] ...... (step 1 of 3) Processing truncatedsvd, total=   0.1s
[Pipeline] ...... (step 2 of 3) Processing minmaxscaler, total=   0.0s
[Pipeline] ........ (step 3 of 3) Processing normalizer, total=   0.0s


In [0]:
# Second corpus: read as tab-separated with no header row, processed in a
# single call, then written as three result tables.
df2 = pd.read_csv(
    '/content/drive/My Drive/Chen/output2.csv',
    sep='\t',
    names=['text'],
    header=None,
)
out1, out2, out3 = lsa_process(df2)
for _frame, _path in zip((out1, out2, out3),
                         ('out2_delv1.csv',
                          'out2_delv2.csv',
                          'out2_delv3.csv')):
  _frame.to_csv(_path)

[Pipeline] ...... (step 1 of 3) Processing truncatedsvd, total=   0.0s
[Pipeline] ...... (step 2 of 3) Processing minmaxscaler, total=   0.0s
[Pipeline] ........ (step 3 of 3) Processing normalizer, total=   0.0s


In [0]:
# Copy every CSV in /content (not only the files written above) to the Drive folder.
!cp /content/*.csv '/content/drive/My Drive/Chen/'