## Content-based recommender for kickstarter projects

In [1]:
# run this statement only once to install Rake
!pip install rake_nltk



In [2]:
import numpy as np
import pandas as pd
from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

import re, nltk, gensim
from nltk.tokenize import ToktokTokenizer
from nltk.stem import wordnet
from nltk.corpus import stopwords
from string import punctuation


### Step 1: Read in and analyse the data

In [4]:
import pandas as pd
import glob

path = r'./data/' # use your path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)



# df = pd.read_csv('Kickstarter057.csv')

df = frame
df.head()

Unnamed: 0,backers_count,blurb,category,converted_pledged_amount,country,country_displayable_name,created_at,creator,currency,currency_symbol,...,state_changed_at,static_usd_rate,urls,usd_pledged,usd_type,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Final Main
0,27,A student design build studio blurring the bou...,"{""id"":25,""name"":""Sculpture"",""slug"":""art/sculpt...",1369,US,the United States,1334085321,"{""id"":1709491227,""name"":""Kevin Taylor"",""is_reg...",USD,$,...,1335762019,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",1369.0,international,"{""id"":25","name:""Sculpture""","slug:""art/sculpture""",art/sculpture,art
1,54,A patch commemorating the SN8 flight from Boca...,"{""id"":1,""name"":""Art"",""slug"":""art"",""position"":1...",1229,US,the United States,1602983631,"{""id"":1475027416,""name"":""Liem Bahneman"",""slug""...",USD,$,...,1608170400,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",1229.0,international,"{""id"":1","name:""Art""","slug:""art""",art,art
2,63,We are creating TRM movie posters for distribu...,"{""id"":21,""name"":""Digital Art"",""slug"":""art/digi...",1637,US,the United States,1341415158,"{""id"":80857009,""name"":""Erik van Ingen"",""is_reg...",USD,$,...,1342238341,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",1637.5,international,"{""id"":21","name:""Digital Art""","slug:""art/digital art""",art/digital art,art
3,26,A video performance piece for the self-designe...,"{""id"":24,""name"":""Performance Art"",""slug"":""art/...",2750,US,the United States,1307480670,"{""id"":744741310,""name"":""Melissa Basaran (delet...",USD,$,...,1310767233,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",2750.0,international,"{""id"":24","name:""Performance Art""","slug:""art/performance art""",art/performance art,art
4,2,Providing young artists with creative sponsors...,"{""id"":288,""name"":""Installations"",""slug"":""art/i...",75,US,the United States,1519066595,"{""id"":1773775377,""name"":""Chelsey Everest Eiel""...",USD,$,...,1522253573,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",75.0,international,"{""id"":288","name:""Installations""","slug:""art/installations""",art/installations,art


In [5]:
def extract_cat(text):
    text = text.split(",")
    
    text = text[1] + " " +text[2]
    

    text = text.replace ("/", " ")
    
    
    text = text.replace ("name", "")
    text = text.replace ("slug", "")
    
    text = text.replace ('"', "")
    text = text.replace ('{', "")
    text = text.replace (':', "")
    
    text = text.lower()
    text = re.sub(r"\'\n", " ", text)
    text = re.sub(r"\'\xa0", " ", text)
    text = re.sub('\s+', ' ', text) # matches all whitespace characters
    text = text.strip(' ')
    return text



df['category'] = df['category'].apply(lambda x: extract_cat(x))


df.head()



Unnamed: 0,backers_count,blurb,category,converted_pledged_amount,country,country_displayable_name,created_at,creator,currency,currency_symbol,...,state_changed_at,static_usd_rate,urls,usd_pledged,usd_type,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Final Main
0,27,A student design build studio blurring the bou...,sculpture art sculpture,1369,US,the United States,1334085321,"{""id"":1709491227,""name"":""Kevin Taylor"",""is_reg...",USD,$,...,1335762019,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",1369.0,international,"{""id"":25","name:""Sculpture""","slug:""art/sculpture""",art/sculpture,art
1,54,A patch commemorating the SN8 flight from Boca...,art art,1229,US,the United States,1602983631,"{""id"":1475027416,""name"":""Liem Bahneman"",""slug""...",USD,$,...,1608170400,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",1229.0,international,"{""id"":1","name:""Art""","slug:""art""",art,art
2,63,We are creating TRM movie posters for distribu...,digital art art digital art,1637,US,the United States,1341415158,"{""id"":80857009,""name"":""Erik van Ingen"",""is_reg...",USD,$,...,1342238341,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",1637.5,international,"{""id"":21","name:""Digital Art""","slug:""art/digital art""",art/digital art,art
3,26,A video performance piece for the self-designe...,performance art art performance art,2750,US,the United States,1307480670,"{""id"":744741310,""name"":""Melissa Basaran (delet...",USD,$,...,1310767233,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",2750.0,international,"{""id"":24","name:""Performance Art""","slug:""art/performance art""",art/performance art,art
4,2,Providing young artists with creative sponsors...,installations art installations,75,US,the United States,1519066595,"{""id"":1773775377,""name"":""Chelsey Everest Eiel""...",USD,$,...,1522253573,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",75.0,international,"{""id"":288","name:""Installations""","slug:""art/installations""",art/installations,art


In [6]:

df = df[['name','category','blurb']]
df.head()


Unnamed: 0,name,category,blurb
0,Bird Wall,sculpture art sculpture,A student design build studio blurring the bou...
1,SpaceX SN8 Starship Prototype 12km Hop Patch,art art,A patch commemorating the SN8 flight from Boca...
2,"""The Real Maine"" movie poster",digital art art digital art,We are creating TRM movie posters for distribu...
3,Voicing Islam Video Performance Piece,performance art art performance art,A video performance piece for the self-designe...
4,The Daughters of Revolution Project,installations art installations,Providing young artists with creative sponsors...


In [7]:

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

def preprocess(sentence):
    sentence=str(sentence)
    sentence = sentence.lower()
    sentence=sentence.replace('{html}',"") 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url=re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)  
    filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
    stem_words=[stemmer.stem(w) for w in filtered_words]
    lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    return " ".join(filtered_words)


df['blurb']=df['blurb'].map(lambda s:preprocess(s)) 
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,name,category,blurb
0,Bird Wall,sculpture art sculpture,student design build studio blurring boundarie...
1,SpaceX SN8 Starship Prototype 12km Hop Patch,art art,patch commemorating flight boca chica
2,"""The Real Maine"" movie poster",digital art art digital art,creating trm movie posters distribution opport...
3,Voicing Islam Video Performance Piece,performance art art performance art,video performance piece self designed voicing ...
4,The Daughters of Revolution Project,installations art installations,providing young artists creative sponsors part...


In [8]:
dfcat = df['category']

dfcat.head 

num_dfcat = len(dfcat)

#print(num_dfcat)

from nltk.tokenize import word_tokenize

dfcat = dfcat.apply(word_tokenize)
dfcat.head()


0                  [sculpture, art, sculpture]
1                                   [art, art]
2            [digital, art, art, digital, art]
3    [performance, art, art, performance, art]
4          [installations, art, installations]
Name: category, dtype: object

In [9]:
from gensim import corpora

dictionary = corpora.Dictionary(dfcat)

print(dictionary)

Dictionary(96 unique tokens: ['art', 'sculpture', 'digital', 'performance', 'installations']...)


In [10]:
# Convert all documents to TF Vectors
all_tf_vectors = [dictionary.doc2bow(doc) for doc in dfcat]

In [11]:
#Label the trained data.

all_data_as_dict = [{id:1 for (id, tf_value) in vec} for vec in all_tf_vectors]

#print(type(all_data_as_dict))
#print(all_data_as_dict)

all_arts = [(d, 'ARTS') for d in all_data_as_dict[0:12]]
all_paintings = [(d, 'PAINTINGS') for d in all_data_as_dict[13:20]]

In [12]:

#Generate the trained classifier
classifier = nltk.NaiveBayesClassifier.train(all_data)

NameError: name 'all_data' is not defined

In [13]:
test_doc = all_data_as_dict[0] #35 is crime article. Until 99 are all crime
print(classifier.classify(test_doc))

NameError: name 'classifier' is not defined