In [1]:
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from src.item_recommender import ItemRecommender

import re

## Basic

### Part 1: Build a recommender

Load the pickle file, set the index to the title, create a bag-of-words representation, and get predictions.

In [2]:
# Load the pre-processed movie table.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
df = pd.read_pickle('data/movie_data.pickle')
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Key_words
0,The Shawshank Redemption,"[crime, drama]",frankdarabont,"[timrobbins, morganfreeman, bobgunton]","[common, decency, finding, solace, number, two..."
1,The Godfather,"[crime, drama]",francisfordcoppola,"[marlonbrando, alpacino, jamescaan]","[reluctant, son, aging, patriarch, clandestine..."
2,The Godfather: Part II,"[crime, drama]",francisfordcoppola,"[alpacino, robertduvall, dianekeaton]","[grip, portrayed, career, tightens, son, vito,..."
3,The Dark Knight,"[action, crime, drama]",christophernolan,"[christianbale, heathledger, aaroneckhart]","[ability, dark, knight, must, accept, one, fig..."
4,12 Angry Men,"[crime, drama]",sidneylumet,"[martinbalsam, johnfiedler, leej.cobb]","[evidence, colleagues, forcing, reconsider, ju..."


In [3]:
# Make the movie title the row index.
# NOTE: inplace=True mutates df, so re-running this cell without first
# reloading df raises KeyError because 'Title' is no longer a column.
df.set_index('Title', inplace = True)
df.head()

Unnamed: 0_level_0,Genre,Director,Actors,Key_words
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
The Shawshank Redemption,"[crime, drama]",frankdarabont,"[timrobbins, morganfreeman, bobgunton]","[common, decency, finding, solace, number, two..."
The Godfather,"[crime, drama]",francisfordcoppola,"[marlonbrando, alpacino, jamescaan]","[reluctant, son, aging, patriarch, clandestine..."
The Godfather: Part II,"[crime, drama]",francisfordcoppola,"[alpacino, robertduvall, dianekeaton]","[grip, portrayed, career, tightens, son, vito,..."
The Dark Knight,"[action, crime, drama]",christophernolan,"[christianbale, heathledger, aaroneckhart]","[ability, dark, knight, must, accept, one, fig..."
12 Angry Men,"[crime, drama]",sidneylumet,"[martinbalsam, johnfiedler, leej.cobb]","[evidence, colleagues, forcing, reconsider, ju..."


In [4]:
def _row_to_bag_of_words(values):
    """Flatten one movie's feature values into a single space-separated string.

    Parameters
    ----------
    values : iterable
        The feature values of one row. Each value is either a plain string
        (e.g. the Director column) or an iterable of string tokens
        (e.g. the Genre, Actors and Key_words columns).

    Returns
    -------
    str
        All tokens joined by single spaces.
    """
    tokens = []
    for value in values:
        if isinstance(value, str):
            tokens.append(value)
        else:
            tokens.extend(value)
    return ' '.join(tokens)


# Only the original feature columns feed the bag of words. Excluding
# 'bag_of_words' itself keeps the target column out of its own input and makes
# this cell a harmless no-op if it is re-run after the features were dropped.
feature_columns = [col for col in df.columns if col != 'bag_of_words']

if feature_columns:
    # Assign the whole column at once: writing to `row` inside df.iterrows()
    # is not guaranteed to reach df, because iterrows may yield copies.
    df['bag_of_words'] = [
        _row_to_bag_of_words(values)
        for values in zip(*(df[col] for col in feature_columns))
    ]
    # Keep only the combined text column for vectorization.
    df.drop(columns=feature_columns, inplace=True)

In [5]:
df.head()

Unnamed: 0_level_0,bag_of_words
Title,Unnamed: 1_level_1
The Shawshank Redemption,crime drama frankdarabont timrobbins morganfr...
The Godfather,crime drama francisfordcoppola marlonbrando a...
The Godfather: Part II,crime drama francisfordcoppola alpacino rober...
The Dark Knight,action crime drama christophernolan christia...
12 Angry Men,crime drama sidneylumet martinbalsam johnfied...


In [6]:
# Instantiate the vectorizer and build the token-count matrix from the
# bag_of_words text of every movie.
count = CountVectorizer()
count_matrix = count.fit_transform(df['bag_of_words'])

# Keep the movie titles in a Series with an ordered numerical index so that
# positions in the count matrix can be matched back to titles later.
indices = pd.Series(df.index)


#### Create an instance of my recommender class and get predictions

In [7]:
# Recommender under test.
rec = ItemRecommender()

# Densify the sparse counts and label every row with its movie title.
count_df = pd.DataFrame(
    data=count_matrix.toarray(),
    index=indices.values,
)

In [8]:
rec.fit(count_df)

In [9]:
print(rec.get_recommendations('Fargo'))

['No Country for Old Men' 'The Departed' 'Rope' 'The Godfather'
 'Reservoir Dogs']


### Q 4)
Fill in code for the `get_user_profile` method.

In [10]:
profile = rec.get_user_profile(['The Godfather','The Godfather: Part II'])

In [11]:
profile

array([0., 0., 0., ..., 0., 0., 0.])

There is not much to see here because this is a large, mostly-zero vector. However, I know that index 584 holds a non-zero value, so we can look there.

In [12]:
profile[584]

4.0

In [13]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installs.
feature_names = (
    count.get_feature_names_out()
    if hasattr(count, 'get_feature_names_out')
    else count.get_feature_names()
)
feature_names[584]

'crime'

It looks like the word crime shows up a lot in the 2 movies involved with this profile.   
    
    
### Q 5)

Fill in the `get_user_recommendation` and test it.

In [14]:
print(rec.get_user_recommendation(['The Godfather','The Godfather: Part II']))

['Goodfellas' 'Rope' 'Cool Hand Luke' 'Scarface' 'Fargo']


## Advanced

### Part 2: Recommend articles 

In [17]:
news = pd.read_csv('data/articles.csv')

In [18]:
news.head()

Unnamed: 0.1,Unnamed: 0,document_type,web_url,lead_paragraph,abstract,snippet,news_desk,word_count,source,section_name,subsection_name,_id,pub_date,print_page,headline,content,content_and_title
0,6,article,http://www.nytimes.com/2013/10/03/sports/socce...,Defending champion Bayern Munich produced a po...,"Bayern Munich beats Manchester City, 3-1, in U...",Defending champion Bayern Munich produced a po...,Sports,190,The New York Times,Sports,Soccer,524ce87638f0d8198973ff59,2013-10-03T00:00:00Z,14,Bayern Munich Dominates Manchester City,defending champion bayern munich produced powe...,defending champion bayern munich produced powe...
1,8,article,http://www.nytimes.com/2013/10/03/sports/hocke...,The Devils announced that Cory Schneider would...,New Jersey Devils announce that Cory Schneider...,The Devils announced that Cory Schneider would...,Sports,110,The New York Times,Sports,Hockey,524ceb0338f0d8198973ff5c,2013-10-03T00:00:00Z,14,Brodeur’s Starting Streak to End,the devil announced cory schneider would start...,the devil announced cory schneider would start...
2,12,article,http://www.nytimes.com/2013/10/03/business/ene...,"Clean Energy Fuels is selling Redeem, a vehicl...","Clean Energy Fuels is selling Redeem, vehicle ...","Clean Energy Fuels is selling Redeem, a vehicl...",Business,743,The New York Times,Business Day,Energy & Environment,524ccb1738f0d8198973ff11,2013-10-03T00:00:00Z,9,Fuel From Landfill Methane Goes on Sale,farmer waste management company energy industr...,farmer waste management company energy industr...
3,13,article,http://www.nytimes.com/2013/10/03/world/africa...,"With her fashion shows, Akuja de Garang is try...",Juba Journal; fashion show organizer Akuja de ...,"With her fashion shows, Akuja de Garang is try...",Foreign,1234,The New York Times,World,Africa,524cc70538f0d8198973ff0a,2013-10-03T00:00:00Z,8,"On Fashion Runway, South Sudan Takes Steps Tow...",juba south sudan even standard fashion model w...,juba south sudan even standard fashion model w...
4,20,article,http://www.nytimes.com/2013/10/03/world/middle...,President Hassan Rouhani of Iran dismissed the...,Iran's Pres Hassan Rouhani dismisses verbal as...,President Hassan Rouhani of Iran dismissed the...,Foreign,164,The New York Times,World,Middle East,524cc1f038f0d8198973fef2,2013-10-03T00:00:00Z,9,Iran’s President Responds to Netanyahu,president hassan rouhani iran wednesday dismis...,president hassan rouhani iran wednesday dismis...


In [19]:
news.headline

0                Bayern Munich Dominates Manchester City
1                       Brodeur’s Starting Streak to End
2                Fuel From Landfill Methane Goes on Sale
3      On Fashion Runway, South Sudan Takes Steps Tow...
4                 Iran’s President Responds to Netanyahu
                             ...                        
695    Defining and Demanding a Musician’s Fair Shake...
696    Britain Plans to Require Community Service for...
697    Old Atrocities, Now Official, Galvanize Afghan...
698    Discussing Iran, Obama and Netanyahu Display U...
699              Government Shuts Down in Budget Impasse
Name: headline, Length: 700, dtype: object

In [20]:
# TF-IDF features from the article body: English stop words are removed and
# the vocabulary is capped at 10,000 terms.
# NOTE: despite its name, vectorized_df is the matrix returned by
# fit_transform (a SciPy sparse matrix per scikit-learn's docs), not a DataFrame.
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
vectorized_df = vectorizer.fit_transform(news.content)

In [21]:
news_recommender = ItemRecommender()
news_recommender.fit(vectorized_df, titles = news.headline)

In [24]:
print(news_recommender.get_recommendations('Bayern Munich Dominates Manchester City'))

['Özil Scores as Arsenal Blanks Napoli'
 'Neymar Scores as Barcelona Stays Perfect in League Play'
 'Manchester Teams Stumble and Fall' 'Devils Defeat Flyers'
 'City Humbles United in Manchester Derby']


## Extra Credit

### Part 3: Improving the recommender

In [25]:
# Join content and headline with a space. Without the separator the last word
# of the content and the first word of the headline fuse into one bogus token,
# which pollutes the vocabulary fitted on this column.
news['content_and_title'] = news.content + ' ' + news.headline
news.head(3)

Unnamed: 0.1,Unnamed: 0,document_type,web_url,lead_paragraph,abstract,snippet,news_desk,word_count,source,section_name,subsection_name,_id,pub_date,print_page,headline,content,content_and_title
0,6,article,http://www.nytimes.com/2013/10/03/sports/socce...,Defending champion Bayern Munich produced a po...,"Bayern Munich beats Manchester City, 3-1, in U...",Defending champion Bayern Munich produced a po...,Sports,190,The New York Times,Sports,Soccer,524ce87638f0d8198973ff59,2013-10-03T00:00:00Z,14,Bayern Munich Dominates Manchester City,defending champion bayern munich produced powe...,defending champion bayern munich produced powe...
1,8,article,http://www.nytimes.com/2013/10/03/sports/hocke...,The Devils announced that Cory Schneider would...,New Jersey Devils announce that Cory Schneider...,The Devils announced that Cory Schneider would...,Sports,110,The New York Times,Sports,Hockey,524ceb0338f0d8198973ff5c,2013-10-03T00:00:00Z,14,Brodeur’s Starting Streak to End,the devil announced cory schneider would start...,the devil announced cory schneider would start...
2,12,article,http://www.nytimes.com/2013/10/03/business/ene...,"Clean Energy Fuels is selling Redeem, a vehicl...","Clean Energy Fuels is selling Redeem, vehicle ...","Clean Energy Fuels is selling Redeem, a vehicl...",Business,743,The New York Times,Business Day,Energy & Environment,524ccb1738f0d8198973ff11,2013-10-03T00:00:00Z,9,Fuel From Landfill Methane Goes on Sale,farmer waste management company energy industr...,farmer waste management company energy industr...


In [26]:
vectorizer2 = TfidfVectorizer(stop_words='english', max_features=10000)
vectorizer2.fit(news.content_and_title)

TfidfVectorizer(max_features=10000, stop_words='english')

In [27]:
# Transform content and headline with the same fitted vectorizer so both
# matrices share one vocabulary (identical columns) and can be combined
# element-wise.
content_matrix = vectorizer2.transform(news.content)
headline_matrix = vectorizer2.transform(news.headline)

In [28]:
def weighted_vector(content_matrix, headline_matrix, alpha):
    """Blend content and headline features into one weighted representation.

    Parameters
    ----------
    content_matrix : array-like or sparse matrix
        Features computed from the article content.
    headline_matrix : array-like or sparse matrix
        Features computed from the article headline; must be compatible with
        ``content_matrix`` for element-wise addition.
    alpha : float
        Weight applied to the content features; the headline features
        receive ``1 - alpha``.

    Returns
    -------
    The element-wise sum ``alpha * content + (1 - alpha) * headline``.
    """
    content_part = content_matrix * alpha
    headline_part = headline_matrix * (1 - alpha)
    return content_part + headline_part

In [29]:
weighted_vector = weighted_vector(content_matrix, headline_matrix, .9)

In [31]:
weighted_recommender = ItemRecommender()
weighted_recommender.fit(weighted_vector, titles = news.headline)
print(weighted_recommender.get_recommendations('Bayern Munich Dominates Manchester City'))

['Özil Scores as Arsenal Blanks Napoli'
 'Manchester Teams Stumble and Fall'
 'Neymar Scores as Barcelona Stays Perfect in League Play'
 'City Humbles United in Manchester Derby' 'Manchester United Wins']
