In [14]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import math
from sklearn.model_selection import train_test_split
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sophiabraz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
file = 'Dataset-Kickstarter-projects-2018.csv'
if file in os.listdir():
    print(f'Encontramos o arquivo {file}, podemos prosseguir!')
else:
    print(f'Não encontramos o arquivo {file} no diretório {os.getcwd()}!')

Encontramos o arquivo Dataset-Kickstarter-projects-2018.csv, podemos prosseguir!


In [3]:
dataset = pd.read_csv(file)
dataset.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [4]:
dataset['main_category'] = dataset.loc[:,'main_category'].astype('category')
dataset['currency'] = dataset.loc[:,'currency'].astype('category')
dataset['deadline'] = dataset.loc[:,'deadline'].astype('datetime64')
dataset['launched'] = dataset.loc[:,'launched'].astype('datetime64')
dataset['state'] = dataset.loc[:,'state'].astype('category')
dataset['country'] = dataset.loc[:,'country'].astype('category')

dataset.dtypes

ID                           int64
name                        object
category                    object
main_category             category
currency                  category
deadline            datetime64[ns]
goal                       float64
launched            datetime64[ns]
pledged                    float64
state                     category
backers                      int64
country                   category
usd pledged                float64
usd_pledged_real           float64
usd_goal_real              float64
dtype: object

In [5]:
dataset['launched_Dates'] = pd.to_datetime(dataset['launched']).dt.date
dataset['launched_Time'] = pd.to_datetime(dataset['launched']).dt.time


dataset['deadline'] = pd.to_datetime(dataset['deadline']).dt.date

dataset['delta_time'] = dataset['deadline'] - dataset['launched_Dates']

dataset.drop(columns = ['ID', 'currency', 'pledged', 'usd pledged'])

Unnamed: 0,name,category,main_category,deadline,goal,launched,state,backers,country,usd_pledged_real,usd_goal_real,launched_Dates,launched_Time,delta_time
0,The Songs of Adelaide & Abullah,Poetry,Publishing,2015-10-09,1000.0,2015-08-11 12:12:28,failed,0,GB,0.0,1533.95,2015-08-11,12:12:28,59 days
1,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,2017-11-01,30000.0,2017-09-02 04:43:57,failed,15,US,2421.0,30000.00,2017-09-02,04:43:57,60 days
2,Where is Hank?,Narrative Film,Film & Video,2013-02-26,45000.0,2013-01-12 00:20:50,failed,3,US,220.0,45000.00,2013-01-12,00:20:50,45 days
3,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,2012-04-16,5000.0,2012-03-17 03:24:11,failed,1,US,1.0,5000.00,2012-03-17,03:24:11,30 days
4,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,2015-08-29,19500.0,2015-07-04 08:35:03,canceled,14,US,1283.0,19500.00,2015-07-04,08:35:03,56 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378656,ChknTruk Nationwide Charity Drive 2014 (Canceled),Documentary,Film & Video,2014-10-17,50000.0,2014-09-17 02:35:30,canceled,1,US,25.0,50000.00,2014-09-17,02:35:30,30 days
378657,The Tribe,Narrative Film,Film & Video,2011-07-19,1500.0,2011-06-22 03:35:14,failed,5,US,155.0,1500.00,2011-06-22,03:35:14,27 days
378658,Walls of Remedy- New lesbian Romantic Comedy f...,Narrative Film,Film & Video,2010-08-16,15000.0,2010-07-01 19:40:30,failed,1,US,20.0,15000.00,2010-07-01,19:40:30,46 days
378659,BioDefense Education Kit,Technology,Technology,2016-02-13,15000.0,2016-01-13 18:13:53,failed,6,US,200.0,15000.00,2016-01-13,18:13:53,31 days


In [6]:
train, test = train_test_split(dataset, test_size=0.2)

In [15]:
import re 


w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def cleanup(text):
    """
        Função de limpeza muito simples que troca alguns sinais básicos por espaços
    """
    #import string
    punctuation = '[!-.:?;"\n"()''"",_%$\|/,<>]' # Note que os sinais [] são delimitadores de um conjunto.
    pattern = re.compile(punctuation)
    text_subbed = re.sub(pattern, '', text)
    
    texto_limpo = []
    
    for w in w_tokenizer.tokenize(text_subbed):
        palavra_limpa = lemmatizer.lemmatize(w)
        texto_limpo.append(palavra_limpa)
        
    
    return texto_limpo


train_limpo = train['name'].apply(cleanup)

TypeError: expected string or bytes-like object

In [None]:
train_limpo