In [5]:
!jupyter nbconvert --to script contentFiltering.ipynb


[NbConvertApp] Converting notebook contentFiltering.ipynb to script
[NbConvertApp] Writing 1982 bytes to contentFiltering.py


In [54]:
import requests

# Flask API URL (assuming it's running locally)
url = 'http://172.200.211.84:5000/get_recommendations'

# Define the article title you want to get recommendations for
article_title = 9213260650272029784

# Prepare the parameters for the GET request (article title)
params = {'title': str(article_title)}

# Send a GET request to the Flask API
response = requests.get(url, params=params)

# Check if the request was successful
if response.status_code == 200:
    data = response.json()
    if 'recommendations' in data:
        print(f"Recommendations for '{article_title}':")
        for idx, rec in enumerate(data['recommendations'], start=1):
            print(f"{idx}. {rec}")
    else:
        print("No recommendations found.")
else:
    print(f"Error: {response.status_code}, {response.text}")


Recommendations for '9213260650272029784':
1. 1º Congresso Brasileiro de IoT - Embarcados
2. Inatel lança seu Smart Campus, um laboratório vivo para demonstração de projetos na área de IoT - Startupi
3. Governo define cronograma para plano nacional de Internet das Coisas
4. Seguradores serão mentores de start-ups selecionadas pela aceleradora da Accenture | Sonho Seguro
5. Como a Microsoft está trabalhando com o conceito de IoT - Startupi


In [38]:
import pandas as pd

articles = pd.read_csv('shared_articles.csv')

In [39]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = text.split()
    words = [ps.stem(word) for word in words if word not in stop_words]
    return " ".join(words)

articles['cleaned_description'] = articles['text'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jakeb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=1000)
tf_matrix = tfidf.fit_transform(articles['cleaned_description'])

In [41]:
from sklearn.metrics.pairwise import cosine_similarity


similarity_matrix = cosine_similarity(tf_matrix)


In [42]:
idx = articles[articles['title'] == 'Why Evernote Is Giving Up on Its Data Centers for Google Cloud'].contentId.iloc[0]


In [43]:
idx

-8964534883296838326

In [44]:
articles

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang,cleaned_description
0,1459192779,CONTENT REMOVED,-6451309518266745024,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en,work still earli first full public version eth...
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en,work still earli first full public version eth...
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,,,,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en,alarm clock wake stream advertfre broadcast ch...
3,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,,,,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en,excit share googl data center tour youtub vide...
4,1459194497,CONTENT SHARED,2448026894306402386,4340306774493623681,8940341205206233829,,,,HTML,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en,ait group project blockchain market could valu...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3117,1487946604,CONTENT SHARED,9213260650272029784,3609194402293569455,7144190892417579456,Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebK...,SP,BR,HTML,https://startupi.com.br/2017/02/liga-ventures-...,"Conheça a Liga IoT, plataforma de inovação abe...","A Liga Ventures, aceleradora de startups espec...",pt,liga ventur aceleradora de startup especializa...
3118,1487947067,CONTENT SHARED,-3295913657316686039,6960073744377754728,-8193630595542572738,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3...,GA,US,HTML,https://thenextweb.com/apps/2017/02/14/amazon-...,Amazon takes on Skype and GoToMeeting with its...,"Amazon has launched Chime, a video conferencin...",en,amazon launch chime video conferenc chat servi...
3119,1488223224,CONTENT SHARED,3618271604906293310,1908339160857512799,-183341653743161643,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0...,SP,BR,HTML,https://code.org/about/2016,Code.org 2016 Annual Report,"February 9, 2017 - We begin each year with a l...",en,februari begin year look k comput scienc lands...
3120,1488300719,CONTENT SHARED,6607431762270322325,-1393866732742189886,2367029511384577082,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,MG,BR,HTML,https://www.bloomberg.com/news/articles/2017-0...,JPMorgan Software Does in Seconds What Took La...,"At JPMorgan Chase & Co., a learning machine is...",en,jpmorgan chase co learn machin pars financi de...


In [45]:
def get_recommendations(title, articles_df, similarity_matrix, tfidf_matrix):
    title = articles_df[articles_df['contentId'] == title]['title'].iloc[0]
    idx = articles_df[articles_df['title'] == title].index[0]
    
    sim_scores = list(enumerate(similarity_matrix[idx]))
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    top_articles = sim_scores[1:6]  
    
    article_indices = [i[0] for i in top_articles]
    
    return articles_df.iloc[article_indices]['title'].tolist()



In [46]:
article_title = articles['title'].iloc[3]
recommended_articles = get_recommendations(9213260650272029784, articles, similarity_matrix, tf_matrix)

print(f"Top 5 Recommendations for '{article_title}':")

for idx, rec in enumerate(recommended_articles, start=1):
    print(f"{idx}. {rec}")


Top 5 Recommendations for 'Google Data Center 360° Tour':
1. 1º Congresso Brasileiro de IoT - Embarcados
2. Inatel lança seu Smart Campus, um laboratório vivo para demonstração de projetos na área de IoT - Startupi
3. Governo define cronograma para plano nacional de Internet das Coisas
4. Seguradores serão mentores de start-ups selecionadas pela aceleradora da Accenture | Sonho Seguro
5. Como a Microsoft está trabalhando com o conceito de IoT - Startupi


In [47]:
import joblib

# Save similarity matrix and TF-IDF model
with open('content_filtering_model.sav', 'wb') as model_file:
    joblib.dump((similarity_matrix, tf_matrix, articles), model_file)

print("Model saved successfully!")


Model saved successfully!
