In [None]:
import pandas as pd
import re

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
df = pd.read_csv('../data/lobsters_full_2017_cleaned.csv')

In [None]:
df.head()

In [None]:
df = df.set_index('Unnamed: 0')

### Investigating the about column

In [None]:
df.about.head()

In [None]:
# Fraction of rows whose "about" value is missing. The mean of a boolean
# mask equals (number of True values) / (number of rows), computed by pandas
# in one vectorized pass instead of Python's builtin sum() looping per item.
df.about.isnull().mean()

In [None]:
df.about.iloc[0]

In [None]:
print(df.about.iloc[0])

In [None]:
# Draw five random rows to eyeball what typical "about" texts look like.
sample = df.sample(5)

# One print call per row: username line, the about text, then a blank line.
for idx, row in sample.iterrows():
    print('Username: {}'.format(row.username), row.about, '', sep='\n')

In [None]:
%matplotlib inline
# Histogram of the character length of every "about" value.
# NOTE(review): len(x) is applied to every value unconditionally, so a
# non-string entry (e.g. a float NaN for a missing "about") would raise a
# TypeError here; the cells below guard with isinstance(x, str) instead.
df.about.map(lambda x: len(x)).hist()

In [None]:
import numpy as np
type(np.nan)

In [None]:
df.about.map(lambda x: len(x) if isinstance(x, str) else 0).hist()

In [None]:
# Store the character length of each "about" text as a new column; any value
# that is not a string is counted as length 0.
df['about_length'] = df.about.map(
    lambda x: len(x) if isinstance(x, str) else 0)

In [None]:
df[df['about_length'] > 1000].sample(5).about

In [None]:
user_df = df[['username', 'about', 'about_length']]

In [None]:
user_df.columns

In [None]:
user_df = user_df.drop_duplicates()

In [None]:
user_df[user_df['about_length'] > 1000]

In [None]:
user_df.about_length.hist(bins=100)

In [None]:
user_df.about_length.mean()

In [None]:
# Fraction of users whose "about" length is 0. The mean of the boolean mask
# equals (number of matches) / (number of rows) and stays vectorized, unlike
# Python's builtin sum() which iterates the Series element by element.
(user_df['about_length'] == 0).mean()

### Preprocessing our data

In [None]:
about_df = user_df[user_df.about_length > 0]

In [None]:
example_text = about_df.iloc[0].about

In [None]:
example_text

In [None]:
sentences = sent_tokenize(example_text)

In [None]:
sentences[0]

In [None]:
words = word_tokenize(sentences[0])

In [None]:
words

## To determine

- What words do I want to keep?
- Is punctuation important or not?
- Are digits or symbols important?
- Are websites important?

In [None]:
import itertools

# Every distinct element obtained by iterating over each value of the tags
# column.
# NOTE(review): if the tags are stored as plain strings rather than lists,
# this is the set of individual characters, not of tag names -- confirm.
all_tags = set(itertools.chain.from_iterable(df.tags.values))
# A set makes the per-word stopword lookup in clean_text O(1) instead of
# scanning a list for every word.
english_stopwords = set(stopwords.words('english'))

def clean_text(sentence):
    """
    Split a sentence into lowercase word tokens and drop unwanted ones.

    Parameters
    ----------
        sentence: str
            Text to split into words.

    Returns
    -------
        words: list of str
            Lowercased runs of word characters, in their original order,
            excluding English stopwords and excluding single-character
            tokens unless the token is a member of ``all_tags``.
    """
    # Raw string: "\w" in a normal string literal is an invalid escape
    # sequence that newer Python versions warn about.
    words = re.findall(r"\w+", sentence.lower())
    return [word for word in words if
            word not in english_stopwords
            and (len(word) > 1 or word in all_tags)
           ]

clean_text(sentences[0])


### Regex to find URLs?

- Useful site for testing Regex: [https://regex101.com/](https://regex101.com/)

In [None]:
def remove_urls(text):
    """
    Remove http:// and https:// URLs from a piece of text.

    Parameters
    ----------
        text: str
            Text that may contain URLs.

    Returns
    -------
        text: str
            The text with every URL deleted. A URL is taken to run from its
            scheme up to the next whitespace, so punctuation glued to the end
            of a URL is removed with it. Surrounding whitespace is kept.
    """
    # Require the "://" after the scheme so ordinary words that merely start
    # with "http" (such as "httpd") are left alone. The raw string avoids the
    # invalid "\S" escape of a normal string literal, and the scheme is
    # matched case-insensitively.
    return re.sub(r'https?://\S+', '', text, flags=re.IGNORECASE)

In [None]:
remove_urls('This is a test: https://test.org. Does it work?')

In [None]:
clean_text(remove_urls(sentences[0]))

### Stemming words

- Usefulness depends on language and your problem

In [None]:
stemmer = PorterStemmer()
[stemmer.stem(w) for w in clean_text(remove_urls(sentences[0]))]

In [None]:
def tokenize(text):
    """
    Turn raw text into a list of stemmed word tokens.

    The text is lowercased, passed through remove_urls and clean_text, and
    each remaining word is stemmed with the module-level ``stemmer``.
    """
    cleaned_words = clean_text(remove_urls(text.lower()))
    return list(map(stemmer.stem, cleaned_words))

In [None]:
for sentence in sentences:
    print(tokenize(sentence))

### TF-IDF with cleaned text

In [None]:
# Build TF-IDF features from the "about" texts, splitting each text with the
# custom tokenize() function defined above.
# max_df=.9 ignores terms that occur in more than 90% of the documents;
# min_df=5 ignores terms that occur in fewer than 5 documents.
vectorizer = TfidfVectorizer(tokenizer=tokenize, 
                             max_df=.9, min_df=5)
# Learn the vocabulary and transform the texts into the document-term matrix.
tfidf_matrix = vectorizer.fit_transform(about_df['about'])

In [None]:
tfidf_matrix[0, :100].toarray()

In [None]:
vectorizer.get_feature_names()[:5]

In [None]:
len(vectorizer.get_feature_names())

In [None]:
first_about = tfidf_matrix[0,:]
first_about.indices

In [None]:
first_about.toarray()

In [None]:
first_about.toarray().flatten()[367]

In [None]:
# Build the vocabulary list and the dense copy of the row once, instead of
# rebuilding both on every loop iteration.
# NOTE: scikit-learn 1.0 deprecated get_feature_names() in favour of
# get_feature_names_out(), and 1.2 removed it; switch if running a newer
# version.
feature_names = vectorizer.get_feature_names()
first_about_weights = first_about.toarray().flatten()

# Print every term stored for the first document with its TF-IDF weight.
for index in first_about.indices:
    print('{}: {}'.format(feature_names[index],
                          first_about_weights[index]))


### Your Turn:
- Can you make a sorted list of the features instead? With the most important words up top?

In [None]:
# %load ../solutions/sorted_tfidf_of_about.py


In [None]:
sorted_tfidf_of_element(tfidf_matrix, 0, vectorizer)

### Finding similar documents using TF-IDF and cosine similarity

In [None]:
cosine_similarity?

In [None]:
tfidf_matrix[0:1]

In [None]:
cosine_similarities = cosine_similarity(
    tfidf_matrix[0:1], tfidf_matrix).flatten()

In [None]:
cosine_similarities

In [None]:
foo = np.array([1, 2, 5, 0])
foo.argsort()

In [None]:
# argsort() returns the indices ordered by ascending similarity. The
# [:-5:-1] slice walks backwards from the last index and stops before
# position -5, so it yields the 4 highest-scoring indices (not 5), best first.
related_docs_indices = cosine_similarities.argsort()[:-5:-1]
related_docs_indices

In [None]:
cosine_similarities[related_docs_indices]

In [None]:
about_df.iloc[563].about

In [None]:
about_df.iloc[1206].about

In [None]:
def get_related_documents(dataframe, row_id, 
                          tfidf_matrix, n=5):
    """
    Get related documents in a dataframe when given a row id and a
    TFIDF matrix created from the dataframe.
    
    Parameters
    ----------
        dataframe: pd.DataFrame
            Dataframe to use to find related documents
        
        row_id: int
            The row id as an integer. This is the index 
            of the row in terms of iloc, not the dataframe index.
        
        tfidf_matrix: scipy sparse matrix or np.ndarray
            TF-IDF matrix as made from a TF-IDF Vectorizer
            fit_transform with the given dataframe. Row i must
            correspond to dataframe.iloc[i].
        
        n: int, default 5
            Number of rows to return. The queried document is compared
            with itself too, so it is normally one of the returned rows.
            Values of 0 or less give an empty dataframe.
    
    Returns
    -------
        dataframe: pd.DataFrame
            Copy of the n most similar rows, ordered from most to least
            similar, with the similarity score as a new column
            named 'similarity_score'.
    """
    # Slicing (rather than indexing) keeps the query as a 2-D, single-row
    # matrix, which is the shape cosine_similarity expects.
    cosine_similarities = cosine_similarity(
        tfidf_matrix[row_id:row_id + 1],
        tfidf_matrix).flatten()

    # Reverse the ascending argsort first and only then take the first n.
    # The previous one-step slice [:-n:-1] returned n - 1 rows, nothing for
    # n=1, and almost every row for n=0.
    row_count = max(n, 0)
    related_docs_indices = cosine_similarities.argsort()[::-1][:row_count]

    related_df = dataframe.iloc[
        related_docs_indices, :].copy()
    # Assign the plain array so scores are matched to rows by position and
    # not aligned on the dataframe's index labels.
    related_df['similarity_score'] = cosine_similarities[
        related_docs_indices]

    return related_df

In [None]:
get_related_documents(about_df, 3, tfidf_matrix)

In [None]:
# For each related document print its score, its about text and a blank line.
for index, row in get_related_documents(about_df, 3, tfidf_matrix).iterrows():
    print(row.similarity_score, row.about, '', sep='\n')

### Your Turn

- Can you find some other interesting connections?
- Can you try writing a function which shows the most important words for a document and returns them as a dataframe?