## Content-based recommender for kickstarter projects

In [1]:
# run this statement only once to install Rake
!pip install rake_nltk
!pip install nltk




In [2]:
import numpy as np
import pandas as pd
from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

import re, nltk, gensim
nltk.download('wordnet')
from nltk.tokenize import ToktokTokenizer
from nltk.stem import wordnet
from nltk.corpus import stopwords
from string import punctuation


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Raymond\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Step 1: Read in and analyse the data

In [3]:
import pandas as pd
import glob

path = r'./data/' # use your path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)



# df = pd.read_csv('Kickstarter057.csv')

df = frame
df.head()

Unnamed: 0,backers_count,blurb,category,converted_pledged_amount,country,country_displayable_name,created_at,creator,currency,currency_symbol,...,source_url,spotlight,staff_pick,state,state_changed_at,static_usd_rate,urls,usd_pledged,usd_type,Final Main
0,27,A student design build studio blurring the bou...,"{""id"":25,""name"":""Sculpture"",""slug"":""art/sculpt...",1369,US,the United States,1334085321,"{""id"":1709491227,""name"":""Kevin Taylor"",""is_reg...",USD,$,...,https://www.kickstarter.com/discover/categorie...,True,False,successful,1335762019,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",1369.0,international,art
1,54,A patch commemorating the SN8 flight from Boca...,"{""id"":1,""name"":""Art"",""slug"":""art"",""position"":1...",1229,US,the United States,1602983631,"{""id"":1475027416,""name"":""Liem Bahneman"",""slug""...",USD,$,...,https://www.kickstarter.com/discover/categorie...,True,False,successful,1608170400,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",1229.0,international,art
2,63,We are creating TRM movie posters for distribu...,"{""id"":21,""name"":""Digital Art"",""slug"":""art/digi...",1637,US,the United States,1341415158,"{""id"":80857009,""name"":""Erik van Ingen"",""is_reg...",USD,$,...,https://www.kickstarter.com/discover/categorie...,True,False,successful,1342238341,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",1637.5,international,art
3,26,A video performance piece for the self-designe...,"{""id"":24,""name"":""Performance Art"",""slug"":""art/...",2750,US,the United States,1307480670,"{""id"":744741310,""name"":""Melissa Basaran (delet...",USD,$,...,https://www.kickstarter.com/discover/categorie...,True,False,successful,1310767233,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",2750.0,international,art
4,2,Providing young artists with creative sponsors...,"{""id"":288,""name"":""Installations"",""slug"":""art/i...",75,US,the United States,1519066595,"{""id"":1773775377,""name"":""Chelsey Everest Eiel""...",USD,$,...,https://www.kickstarter.com/discover/categorie...,False,False,failed,1522253573,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",75.0,international,art


In [4]:
def extract_cat(text):
    text = text.split(",")
    
    text = text[2]
    

    text = text.replace ("/", " ")
    
    
    text = text.replace ("name", "")
    text = text.replace ("slug", "")
    
    text = text.replace ('"', "")
    text = text.replace ('{', "")
    text = text.replace (':', "")
    
    text = text.lower()
    text = re.sub(r"\'\n", " ", text)
    text = re.sub(r"\'\xa0", " ", text)
    text = re.sub('\s+', ' ', text) # matches all whitespace characters
    text = text.strip(' ')
    return text



df['category'] = df['category'].apply(lambda x: extract_cat(x))


df.head()



Unnamed: 0,backers_count,blurb,category,converted_pledged_amount,country,country_displayable_name,created_at,creator,currency,currency_symbol,...,source_url,spotlight,staff_pick,state,state_changed_at,static_usd_rate,urls,usd_pledged,usd_type,Final Main
0,27,A student design build studio blurring the bou...,art sculpture,1369,US,the United States,1334085321,"{""id"":1709491227,""name"":""Kevin Taylor"",""is_reg...",USD,$,...,https://www.kickstarter.com/discover/categorie...,True,False,successful,1335762019,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",1369.0,international,art
1,54,A patch commemorating the SN8 flight from Boca...,art,1229,US,the United States,1602983631,"{""id"":1475027416,""name"":""Liem Bahneman"",""slug""...",USD,$,...,https://www.kickstarter.com/discover/categorie...,True,False,successful,1608170400,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",1229.0,international,art
2,63,We are creating TRM movie posters for distribu...,art digital art,1637,US,the United States,1341415158,"{""id"":80857009,""name"":""Erik van Ingen"",""is_reg...",USD,$,...,https://www.kickstarter.com/discover/categorie...,True,False,successful,1342238341,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",1637.5,international,art
3,26,A video performance piece for the self-designe...,art performance art,2750,US,the United States,1307480670,"{""id"":744741310,""name"":""Melissa Basaran (delet...",USD,$,...,https://www.kickstarter.com/discover/categorie...,True,False,successful,1310767233,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",2750.0,international,art
4,2,Providing young artists with creative sponsors...,art installations,75,US,the United States,1519066595,"{""id"":1773775377,""name"":""Chelsey Everest Eiel""...",USD,$,...,https://www.kickstarter.com/discover/categorie...,False,False,failed,1522253573,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",75.0,international,art


In [5]:

df = df[['name','category','blurb']]
df.head()


Unnamed: 0,name,category,blurb
0,Bird Wall,art sculpture,A student design build studio blurring the bou...
1,SpaceX SN8 Starship Prototype 12km Hop Patch,art,A patch commemorating the SN8 flight from Boca...
2,"""The Real Maine"" movie poster",art digital art,We are creating TRM movie posters for distribu...
3,Voicing Islam Video Performance Piece,art performance art,A video performance piece for the self-designe...
4,The Daughters of Revolution Project,art installations,Providing young artists with creative sponsors...


In [6]:

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

def preprocess(sentence):
    sentence=str(sentence)
    sentence = sentence.lower()
    sentence=sentence.replace('{html}',"") 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url=re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)  
    filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
    stem_words=[stemmer.stem(w) for w in filtered_words]
    lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    return " ".join(filtered_words)


df['blurb']=df['blurb'].map(lambda s:preprocess(s)) 
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['blurb']=df['blurb'].map(lambda s:preprocess(s))


Unnamed: 0,name,category,blurb
0,Bird Wall,art sculpture,student design build studio blurring boundarie...
1,SpaceX SN8 Starship Prototype 12km Hop Patch,art,patch commemorating flight boca chica
2,"""The Real Maine"" movie poster",art digital art,creating trm movie posters distribution opport...
3,Voicing Islam Video Performance Piece,art performance art,video performance piece self designed voicing ...
4,The Daughters of Revolution Project,art installations,providing young artists creative sponsors part...


In [7]:
#Tokenize everything in the category
dfcat = df['category']

dfcat.head 

num_dfcat = len(dfcat)

#print(num_dfcat)

from nltk.tokenize import word_tokenize

dfcattok = dfcat.apply(word_tokenize)
dfcattok.head()

#for loop each one and collect the first value. This shows the main categories that we have derived from our own dataset.
maincat_list = []

for x in dfcattok:
    if x[0] not in maincat_list:
        maincat_list.append(x[0])
        
print(maincat_list)

['art', 'comics', 'film', 'music', 'photography', 'publishing', 'technology']


In [8]:
#Load dataset
import pandas as pd
import glob

#Dataset
from nltk.corpus import PlaintextCorpusReader

art = PlaintextCorpusReader('data/Train/Art', '.+\.txt')
tech = PlaintextCorpusReader('data/Train/Tech', '.+\.txt')
comics = PlaintextCorpusReader('data/Train/Comics', '.+\.txt')
film = PlaintextCorpusReader('data/Train/Film', '.+\.txt')
music = PlaintextCorpusReader('data/Train/Music', '.+\.txt')
photography = PlaintextCorpusReader('data/Train/Photography', '.+\.txt')
publishing = PlaintextCorpusReader('data/Train/Publishing', '.+\.txt')


art_docs1 = [art.words(fid) for fid in art.fileids()]
tech_docs1 = [tech.words(fid) for fid in tech.fileids()]
comics_docs1 = [comics.words(fid) for fid in comics.fileids()]
film_docs1 = [film.words(fid) for fid in film.fileids()]
music_docs1 = [music.words(fid) for fid in music.fileids()]
photography_docs1 = [photography.words(fid) for fid in photography.fileids()]
publishing_docs1 = [publishing.words(fid) for fid in publishing.fileids()]

print(art_docs1[0][0:20])
print(tech_docs1[0][0:20])
print(comics_docs1[0][0:20])
print(film_docs1[0][0:20])
print(music_docs1[0][0:20])
print(photography_docs1[0][0:20])
print(publishing_docs1[0][0:20])

['A', 'student', 'design', 'build', 'studio', 'blurring', 'the', 'boundaries', 'of', 'architecture', 'and', 'ceramics', '(', 'and', 'birds', ').', 'A', 'patch', 'commemorating', 'the']
['CLA', 'is', 'an', 'innovative', 'product', 'designed', 'to', 'provide', 'real', 'time', 'legal', 'resources', 'to', 'largely', 'underserved', 'individuals', '.', 'The', 'LightSpike', 'is']
['The', 'Marvel', 'Era', '1982', '-', '1996', ':', 'A', 'G', '.', 'I', '.', 'Joe', 'A', 'Real', 'American', 'Hero', 'Comic', 'Collecting', 'Guide']
['The', 'popular', 'drunken', 'cooking', 'show', 'Put', 'It', 'In', 'Your', 'Mouth', 'will', 'shoot', '12', 'new', 'episodes', 'in', 'Los', 'Angeles', 'CA', 'with']
['Noah', '(', '13', ')', 'has', 'been', 'studying', 'piano', 'and', 'composing', 'since', 'age', '5', '.', 'He', 'wants', 'to', 'create', 'his', 'first']
['Last', 'year', 'our', 'calendar', 'was', 'our', 'biggest', 'fundraiser', ',', 'celebrating', 'and', 'sharing', 'the', 'fantastic', 'dogs', 'who', 'have', '

In [9]:
###Basically preprocessing date from dataset

# Combine the categories of the corpus
all_docs1 = art_docs1 + tech_docs1 + comics_docs1 + film_docs1 + music_docs1 + photography_docs1 + publishing_docs1
num_art_docs = len(art_docs1)
num_2 = len(art_docs1) + len(tech_docs1)
num_3 = num_2 + len(comics_docs1)
num_4 = num_3 + len(film_docs1)
num_5 = num_4 + len(music_docs1)
num_6 = num_5 + len(photography_docs1)

#For verifying the whether the output in dictionary is correct
print(num_art_docs)
print (len(tech_docs1))
print (len(comics_docs1))
print (len(film_docs1))
print (len(music_docs1))
print (len(photography_docs1))

# Processsing for stopwords, alphabetic words, Stemming 

all_docs2 = [[w.lower() for w in doc] for doc in all_docs1]

import re
all_docs3 = [[w for w in doc if re.search('^[a-z]+$',w)] for doc in all_docs2]

from nltk.corpus import stopwords
stop_list = stopwords.words('english')
all_docs4 = [[w for w in doc if w not in stop_list] for doc in all_docs3]

from nltk.stem.porter import *
stemmer = PorterStemmer()
all_docs5 = [[stemmer.stem(w) for w in doc] for doc in all_docs4]

#Create dictionary
from gensim import corpora
dictionary = corpora.Dictionary(all_docs5)
print(dictionary)

# Convert all documents to TF Vectors
all_tf_vectors = [dictionary.doc2bow(doc) for doc in all_docs5]

#Label the trained data. Since the folder name is the label, I use the same labels.

all_data_as_dict = [{id:1 for (id, tf_value) in vec} for vec in all_tf_vectors]
print(type(all_data_as_dict))


#print(all_data_as_dict). The labels are generated by our own dataset and used here.
art_data = [(d, 'art') for d in all_data_as_dict[0:num_art_docs]] #First document to number of art documents, which is 4. Document 0-4
tech_data = [(d, 'tech') for d in all_data_as_dict[num_art_docs:num_2]]
comics_data = [(d, 'comics') for d in all_data_as_dict[num_2:num_3]]
film_data = [(d, 'film') for d in all_data_as_dict[num_3:num_4]]
music_data = [(d, 'music') for d in all_data_as_dict[num_4:num_5]]
photography_data = [(d, 'photography') for d in all_data_as_dict[num_5:num_6]]
publishing_data = [(d, 'publishing') for d in all_data_as_dict[num_6:]]

all_labeled_data = art_data + tech_data + comics_data + film_data + music_data + photography_data + publishing_data

#Generate the trained classifier
classifier = nltk.NaiveBayesClassifier.train(all_labeled_data)

test_doc = all_data_as_dict[200]
#print(all_data_as_dict[0])
print(classifier.classify(test_doc))

100
100
100
100
100
100
Dictionary(11367 unique tokens: ['ancient', 'announc', 'architectur', 'art', 'artist']...)
<class 'list'>
comics


In [10]:
### Validate 
# Read the files in validate folder and preparing the validation corpus
art_validation = PlaintextCorpusReader('data/Validate/Art', '.+\.txt')
tech_validation = PlaintextCorpusReader('data/Validate/Tech', '.+\.txt')
comics_validation = PlaintextCorpusReader('data/Validate/Comics', '.+\.txt')
film_validation = PlaintextCorpusReader('data/Validate/Film', '.+\.txt')
music_validation = PlaintextCorpusReader('data/Validate/Music', '.+\.txt')
photography_validation = PlaintextCorpusReader('data/Validate/Photography', '.+\.txt')
publishing_validation = PlaintextCorpusReader('data/Validate/Publishing', '.+\.txt')


# Tokenization
art_valid_docs1 = [art_validation.words(fid) for fid in art_validation.fileids()]
tech_valid_docs1 = [tech_validation.words(fid) for fid in tech_validation.fileids()]
comics_valid_docs1 = [comics_validation.words(fid) for fid in comics_validation.fileids()]
film_valid_docs1 = [film_validation.words(fid) for fid in film_validation.fileids()]
music_valid_docs1 = [music_validation.words(fid) for fid in music_validation.fileids()]
photography_valid_docs1 = [photography_validation.words(fid) for fid in photography_validation.fileids()]
publishing_valid_docs1 = [publishing_validation.words(fid) for fid in publishing_validation.fileids()]


# Combine the two sets of documents for easy processing.
all_valid_docs = art_valid_docs1 + tech_valid_docs1 + comics_valid_docs1 + film_valid_docs1 + music_valid_docs1 + photography_valid_docs1 + publishing_valid_docs1


# This number will be used to separate the two sets of documents later.
num_art_valid_docs = len(art_valid_docs1)
num_valid_2 = num_art_valid_docs + len(tech_valid_docs1)
num_valid_3 = num_valid_2 + len(comics_valid_docs1)
num_valid_4 = num_valid_3 + len(film_valid_docs1)
num_valid_5 = num_valid_4 + len(music_valid_docs1)
num_valid_6 = num_valid_5 + len(photography_valid_docs1)
                    

# Text pre-processing, including stop word removal, stemming, etc.
all_valid_docs2 = [[w.lower() for w in doc] for doc in all_valid_docs]
all_valid_docs3 = [[w for w in doc if re.search('^[a-z]+$',w)] for doc in all_valid_docs2]
all_valid_docs4 = [[w for w in doc if w not in stop_list] for doc in all_valid_docs3]
all_valid_docs5 = [[stemmer.stem(w) for w in doc] for doc in all_valid_docs4]

# Note that we're using the dictionary created earlier.
all_valid_tf_vectors = [dictionary.doc2bow(doc) for doc in all_valid_docs5]

# Convert documents into dict representation.
all_valid_data_as_dict = [{id:1 for (id, tf_value) in vec} for vec in all_valid_tf_vectors]

# Separate the two sets of documents and add labels.
art_valid_data_with_labels = [(d, 'art') for d in all_valid_data_as_dict[0:num_art_valid_docs]]
tech_valid_data_with_labels  = [(d, 'tech') for d in all_valid_data_as_dict[num_art_valid_docs:num_valid_2]]
comics_valid_data_with_labels  = [(d, 'comics') for d in all_valid_data_as_dict[num_valid_2:num_valid_3]]
film_valid_data_with_labels  = [(d, 'film') for d in all_valid_data_as_dict[num_valid_3:num_valid_4]]
music_valid_data_with_labels  = [(d, 'music') for d in all_valid_data_as_dict[num_valid_4:num_valid_5]]
photography_valid_data_with_labels  = [(d, 'photography') for d in all_valid_data_as_dict[num_valid_5:num_valid_6]]
publishing_valid_data_with_labels  = [(d, 'publishing') for d in all_valid_data_as_dict[num_valid_6:]]


# Combine the labeled documents.
all_valid_data_with_labels = art_valid_data_with_labels + tech_valid_data_with_labels + comics_valid_data_with_labels + film_valid_data_with_labels + music_valid_data_with_labels + photography_valid_data_with_labels + publishing_valid_data_with_labels 

In [11]:
print(nltk.classify.accuracy(classifier, all_valid_data_with_labels))

0.9928571428571429


## Mode Testing - Predicting labels for other documents

In [12]:
#Read the text files
test_corpus = PlaintextCorpusReader('data/Test', '.+\.txt')
fids = test_corpus.fileids()

# Tokenization
test_docs1 = [test_corpus.words(fid) for fid in fids]

# Text pre-processing, including stop word removal, stemming, etc.
test_docs2 = [[w.lower() for w in doc] for doc in test_docs1]
test_docs3 = [[w for w in doc if re.search('^[a-z]+$',w)] for doc in test_docs2]
test_docs4 = [[w for w in doc if w not in stop_list] for doc in test_docs3]
test_docs5 = [[stemmer.stem(w) for w in doc] for doc in test_docs4]

# Note that we're using the dictionary created earlier to create TF vectors
test_tf_vectors = [dictionary.doc2bow(doc) for doc in test_docs5]

# Convert documents into dict representation. This is document-label representation
test_data_as_dict = [{id:1 for (id, tf_value) in vec} for vec in test_tf_vectors]

#For each file, classify and print the label.
for i in range(len(fids)):
    print(fids[i], '-->', classifier.classify(test_data_as_dict[i]))


art_2_000001.txt --> art
art_2_000002.txt --> art
art_2_000003.txt --> art
art_2_000004.txt --> art
art_2_000005.txt --> art
art_2_000006.txt --> art
art_2_000007.txt --> art
art_2_000008.txt --> art
art_2_000009.txt --> art
art_2_000010.txt --> art
comics_10_000001.txt --> comics
comics_10_000002.txt --> publishing
comics_10_000003.txt --> comics
comics_10_000004.txt --> comics
comics_10_000005.txt --> comics
comics_10_000006.txt --> comics
comics_10_000007.txt --> comics
comics_10_000008.txt --> comics
comics_10_000009.txt --> comics
comics_10_000010.txt --> comics
film_4_000001.txt --> film
film_4_000002.txt --> film
film_4_000003.txt --> film
film_4_000004.txt --> film
film_4_000005.txt --> film
film_4_000006.txt --> film
film_4_000007.txt --> film
film_4_000008.txt --> film
film_4_000009.txt --> film
film_4_000010.txt --> film
music_4_000001.txt --> music
music_4_000002.txt --> music
music_4_000003.txt --> music
music_4_000004.txt --> music
music_4_000005.txt --> music
music_4_000