## Content-based recommender for kickstarter projects

In [1]:
# run this statement only once to install Rake
!pip install rake_nltk



In [2]:
import numpy as np
import pandas as pd
from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

import re, nltk, gensim
from nltk.tokenize import ToktokTokenizer
from nltk.stem import wordnet
from nltk.corpus import stopwords
from string import punctuation


### Step 1: Read in and analyse the data

In [3]:
import pandas as pd
import glob

#Dataset
from nltk.corpus import PlaintextCorpusReader

art = PlaintextCorpusReader('data/Train/Art', '.+\.txt')
tech = PlaintextCorpusReader('data/Train/Tech', '.+\.txt')

art_docs1 = [art.words(fid) for fid in art.fileids()]
tech_docs1 = [tech.words(fid) for fid in tech.fileids()]

print(art_docs1[0][0:20])
print(tech_docs1[0][0:20])


#DY dataset 
path = r'./data/' # use your path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)

# df = pd.read_csv('Kickstarter057.csv')

df = frame
df.head()

['A', 'student', 'design', 'build', 'studio', 'blurring', 'the', 'boundaries', 'of', 'architecture', 'and', 'ceramics', '(', 'and', 'birds', ').', 'A', 'patch', 'commemorating', 'the']
['CLA', 'is', 'an', 'innovative', 'product', 'designed', 'to', 'provide', 'real', 'time', 'legal', 'resources', 'to', 'largely', 'underserved', 'individuals', '.', 'The', 'LightSpike', 'is']


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,Final Main,Unnamed: 38,backers_count,blub,blurb,category,converted_pledged_amount,country,country_displayable_name,created_at,...,slug,source_url,spotlight,staff_pick,state,state_changed_at,static_usd_rate,urls,usd_pledged,usd_type
0,art,,27,,A student design build studio blurring the bou...,"{""id"":25,""name"":""Sculpture"",""slug"":""art/sculpt...",1369,US,the United States,1334085321,...,bird-wall,https://www.kickstarter.com/discover/categorie...,True,False,successful,1335762019,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",1369.0,international
1,art,,54,,A patch commemorating the SN8 flight from Boca...,"{""id"":1,""name"":""Art"",""slug"":""art"",""position"":1...",1229,US,the United States,1602983631,...,spacex-sn8-starship-prototype-12km-hop-patch,https://www.kickstarter.com/discover/categorie...,True,False,successful,1608170400,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",1229.0,international
2,art,,63,,We are creating TRM movie posters for distribu...,"{""id"":21,""name"":""Digital Art"",""slug"":""art/digi...",1637,US,the United States,1341415158,...,the-real-maine-movie-poster,https://www.kickstarter.com/discover/categorie...,True,False,successful,1342238341,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",1637.5,international
3,art,,26,,A video performance piece for the self-designe...,"{""id"":24,""name"":""Performance Art"",""slug"":""art/...",2750,US,the United States,1307480670,...,voicing-islam-video-performance-piece,https://www.kickstarter.com/discover/categorie...,True,False,successful,1310767233,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",2750.0,international
4,art,,2,,Providing young artists with creative sponsors...,"{""id"":288,""name"":""Installations"",""slug"":""art/i...",75,US,the United States,1519066595,...,the-daughters-of-revolution-project,https://www.kickstarter.com/discover/categorie...,False,False,failed,1522253573,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",75.0,international


In [4]:
# Combine the categories of the corpus
all_docs1 = art_docs1 + tech_docs1
num_art_docs = len(art_docs1)
print(num_art_docs)
print (len(tech_docs1))

# Processsing for stopwords, alphabetic words, Stemming 

all_docs2 = [[w.lower() for w in doc] for doc in all_docs1]

import re
all_docs3 = [[w for w in doc if re.search('^[a-z]+$',w)] for doc in all_docs2]

from nltk.corpus import stopwords
stop_list = stopwords.words('english')
all_docs4 = [[w for w in doc if w not in stop_list] for doc in all_docs3]

from nltk.stem.porter import *
stemmer = PorterStemmer()
all_docs5 = [[stemmer.stem(w) for w in doc] for doc in all_docs4]

#Create dictionary
from gensim import corpora
dictionary = corpora.Dictionary(all_docs5)
print(dictionary)

# Convert all documents to TF Vectors
all_tf_vectors = [dictionary.doc2bow(doc) for doc in all_docs5]

#Label the trained data. Since the folder name is the label, I use the same labels.

all_data_as_dict = [{id:1 for (id, tf_value) in vec} for vec in all_tf_vectors]
print(type(all_data_as_dict))

#print(all_data_as_dict)

art_data = [(d, 'ART') for d in all_data_as_dict[0:num_art_docs]]
tech_data = [(d, 'TECH') for d in all_data_as_dict[num_art_docs:]]
all_labeled_data = art_data + tech_data

#Generate the trained classifier
classifier = nltk.NaiveBayesClassifier.train(all_labeled_data)

test_doc = all_data_as_dict[4] #35 is crime article. Until 99 are all crime
#print(all_data_as_dict[0])
print(classifier.classify(test_doc))


#Processing data for DY
def extract_cat(text):
    text = text.split(",")
    
    text = text[1] + " " +text[2]
    

    text = text.replace ("/", " ")
    
    
    text = text.replace ("name", "")
    text = text.replace ("slug", "")
    
    text = text.replace ('"', "")
    text = text.replace ('{', "")
    text = text.replace (':', "")
    
    text = text.lower()
    text = re.sub(r"\'\n", " ", text)
    text = re.sub(r"\'\xa0", " ", text)
    text = re.sub('\s+', ' ', text) # matches all whitespace characters
    text = text.strip(' ')
    return text



df['category'] = df['category'].apply(lambda x: extract_cat(x))


df.head()



1
1
Dictionary(5784 unique tokens: ['aalborg', 'ab', 'abaddon', 'abalon', 'abandon']...)
<class 'list'>


IndexError: list index out of range

In [None]:

df = df[['name','category','blurb']]
df.head()


In [None]:

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

def preprocess(sentence):
    sentence=str(sentence)
    sentence = sentence.lower()
    sentence=sentence.replace('{html}',"") 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url=re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)  
    filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
    stem_words=[stemmer.stem(w) for w in filtered_words]
    lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    return " ".join(filtered_words)


df['blurb']=df['blurb'].map(lambda s:preprocess(s)) 
df.head()

In [None]:
dfcat = df['category']

dfcat.head 

num_dfcat = len(dfcat)

#print(num_dfcat)

from nltk.tokenize import word_tokenize

dfcat = dfcat.apply(word_tokenize)
dfcat.head()


In [None]:
from gensim import corpora

dictionary = corpora.Dictionary(dfcat)

print(dictionary)