In [0]:
import io
import os

import numpy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [0]:
# Mount Google Drive at /content/gdrive so files stored there are reachable from the notebook.
# force_remount=True asks Colab to redo the mount rather than reuse an existing one.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [0]:
!ls 'gdrive/My Drive/TennisData'

In [0]:
# Absolute path of the Drive folder where this notebook writes its output file.
FILES_DIR = '/content/gdrive/My Drive/TennisData'

In [0]:
# Six match summaries for the similarity demo: each of the three pairings
# (Isner - Lu, Lu - Ferrer, Lu - Johnson) appears once in a short form and once
# in a longer comma-separated form that adds country, tour and discipline.
sample_data = [
    '2014-01 Auckland: Isner - Lu',
    '2014-01, Auckland, New Zealand, ATP, singles: Isner - Lu',
    '2014-01, Auckland, New Zealand, ATP, singles: Lu - Ferrer',
    '2014-01 Auckland: Lu - Johnson',
    '2014-01, Auckland, New Zealand, ATP, singles: Lu - Johnson',
    '2014-01 Auckland: Lu - Ferrer'
]

In [0]:
# Wrap the raw summaries in a one-column frame so the pandas string helpers can be used on them.
sample_df = pd.DataFrame(sample_data, columns=['summary'])

In [0]:
# Lower-case every summary, overwriting the column in place.
sample_df['summary'] = sample_df['summary'].str.lower()

In [0]:
# Join the two-word country name with an underscore; the tokenizer pattern used
# later in this notebook accepts '_', so "new_zealand" ends up as a single term.
sample_df['summary'] = sample_df['summary'].str.replace('new zealand', 'new_zealand')

In [9]:
# Display the lower-cased, normalized summaries.
sample_df

Unnamed: 0,summary
0,2014-01 auckland: isner - lu
1,"2014-01, auckland, new_zealand, atp, singles: ..."
2,"2014-01, auckland, new_zealand, atp, singles: ..."
3,2014-01 auckland: lu - johnson
4,"2014-01, auckland, new_zealand, atp, singles: ..."
5,2014-01 auckland: lu - ferrer


In [0]:
# TF-IDF vectorizer for the match summaries.
#
# token_pattern: a token is a run of Latin or Cyrillic letters, digits, '/', '_'
# and '-', so hyphenated dates such as "2014-01" and underscore-joined names such
# as "new_zealand" stay single terms.
# The pattern is spelled as a u'' literal with doubled backslashes because the
# Python-2-only ur'' prefix is a SyntaxError on Python 3; the regex text itself is
# unchanged and identical on both interpreters.
#
# stop_words: 'atp' and 'singles' are dropped from the vocabulary.
tfidf_vectorizer = TfidfVectorizer(
    min_df=1, max_df=1.0,
    token_pattern=u'(?u)\\b[a-zA-Zа-яА-ЯёЁ0-9/_\\-]+\\b',
    stop_words=['atp', 'singles']
)

In [0]:
# Fit the vectorizer on the summaries and keep the resulting TF-IDF matrix.
tfidf = tfidf_vectorizer.fit_transform(sample_df['summary'])

In [12]:
# Inspect the learned vocabulary: each term paired with its column index.
tfidf_vectorizer.vocabulary_.items()

[(u'isner', 3),
 (u'lu', 5),
 (u'ferrer', 2),
 (u'johnson', 4),
 (u'auckland', 1),
 (u'new_zealand', 6),
 (u'2014-01', 0)]

In [13]:
# Inspect the learned IDF weights, one value per vocabulary column.
tfidf_vectorizer.idf_

array([1.        , 1.        , 1.84729786, 1.84729786, 1.84729786,
       1.        , 1.55961579])

In [14]:
# Dense view of the TF-IDF matrix: one row per summary, one column per vocabulary term.
tfidf.toarray()

array([[0.39489896, 0.39489896, 0.        , 0.72949601, 0.        ,
        0.39489896, 0.        ],
       [0.33624302, 0.33624302, 0.        , 0.621141  , 0.        ,
        0.33624302, 0.52440992],
       [0.33624302, 0.33624302, 0.621141  , 0.        , 0.        ,
        0.33624302, 0.52440992],
       [0.39489896, 0.39489896, 0.        , 0.        , 0.72949601,
        0.39489896, 0.        ],
       [0.33624302, 0.33624302, 0.        , 0.        , 0.621141  ,
        0.33624302, 0.52440992],
       [0.39489896, 0.39489896, 0.72949601, 0.        , 0.        ,
        0.39489896, 0.        ]])

In [0]:
# Pairwise cosine similarity of the summaries: the TF-IDF matrix is compared with itself,
# giving a square matrix indexed by summary position on both axes.
cs = cosine_similarity(X=tfidf, Y=tfidf)

In [16]:
# Show the pairwise similarity matrix.
cs

array([[1.        , 0.85146594, 0.39834605, 0.46783557, 0.39834605,
        0.46783557],
       [0.85146594, 1.        , 0.61418385, 0.39834605, 0.61418385,
        0.39834605],
       [0.39834605, 0.61418385, 1.        , 0.39834605, 0.61418385,
        0.85146594],
       [0.46783557, 0.39834605, 0.39834605, 1.        , 0.85146594,
        0.46783557],
       [0.39834605, 0.61418385, 0.61418385, 0.85146594, 1.        ,
        0.39834605],
       [0.46783557, 0.39834605, 0.85146594, 0.46783557, 0.39834605,
        1.        ]])

In [17]:
# For every summary, print all other summaries ordered from most to least similar.
# Series.items() is used instead of iteritems(), which was removed in pandas 2.0;
# both yield the same (index label, value) pairs.
for sent_idx, sentence in sample_df['summary'].items():
  # argsort is ascending, so flip it to get the most similar rows first.
  best_sim_idx = numpy.flip(cs[sent_idx].argsort(), 0)

  print(sentence)
  for best_sim_row_idx in best_sim_idx:
    # Skip the summary itself.
    if best_sim_row_idx == sent_idx:
      continue
    print('--> {} = similarity: {}'.format(sample_df['summary'].iloc[best_sim_row_idx], cs[sent_idx][best_sim_row_idx]))

2014-01 auckland: isner - lu
--> 2014-01, auckland, new_zealand, atp, singles: isner - lu = similarity: 0.851465936447
--> 2014-01 auckland: lu - ferrer = similarity: 0.467835572611
--> 2014-01 auckland: lu - johnson = similarity: 0.467835572611
--> 2014-01, auckland, new_zealand, atp, singles: lu - johnson = similarity: 0.398346053936
--> 2014-01, auckland, new_zealand, atp, singles: lu - ferrer = similarity: 0.398346053936
2014-01, auckland, new_zealand, atp, singles: isner - lu
--> 2014-01 auckland: isner - lu = similarity: 0.851465936447
--> 2014-01, auckland, new_zealand, atp, singles: lu - johnson = similarity: 0.614183854915
--> 2014-01, auckland, new_zealand, atp, singles: lu - ferrer = similarity: 0.614183854915
--> 2014-01 auckland: lu - ferrer = similarity: 0.398346053936
--> 2014-01 auckland: lu - johnson = similarity: 0.398346053936
2014-01, auckland, new_zealand, atp, singles: lu - ferrer
--> 2014-01 auckland: lu - ferrer = similarity: 0.851465936447
--> 2014-01, auckland

In [0]:
# Persist the learned vocabulary to Drive, one term per line, ordered by column index.
# The terms are taken from vocabulary_ (term -> column index) because
# get_feature_names() was removed in scikit-learn 1.2; sorting the terms by their
# column index reproduces the order that method returned.
vocabulary_index = tfidf_vectorizer.vocabulary_
feature_names = sorted(vocabulary_index, key=vocabulary_index.get)

# io.open with an explicit encoding writes UTF-8 on both Python 2 and Python 3, so
# non-ASCII terms (the token pattern admits Cyrillic letters) are stored safely
# regardless of the interpreter or locale default.
with io.open(os.path.join(FILES_DIR, u'atp-single-2010-2018-words.txt'), 'w', encoding='utf-8') as f:
  for word in feature_names:
    f.write(u'{}\n'.format(word))