In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from nltk.tokenize import TweetTokenizer

In [2]:
def transform_string_merge(df):
    # First transform all the relevant numerical columns to strings
    df["Type"] = df["Type"].apply(lambda x: "Dog" if x == 1 else "Cat")
    
    breed = pd.read_csv("BreedLabels.csv")
    breed_dict = dict(zip(breed["BreedID"], breed["BreedName"]))
    df["Breed1"] = df["Breed1"].map(breed_dict)
    df["Breed2"] = df["Breed2"].map(breed_dict)
    
    gender_dict = {1:"Male", 2:"Female", 3:"Mixed"}
    df["Gender"] = df["Gender"].map(gender_dict)
    
    color = pd.read_csv("ColorLabels.csv")
    color_dict = dict(zip(color["ColorID"], color["ColorName"]))
    df["Color1"] = df["Color1"].map(color_dict)
    df["Color2"] = df["Color2"].map(color_dict)
    df["Color3"] = df["Color3"].map(color_dict)
    
    maturity_dict = {1:"Small", 2:"Medium", 3:"Large", 4:"Extra Large", 0:"Not Specified"}
    df["MaturitySize"] = df["MaturitySize"].map(maturity_dict)
    
    fur_dict = {1:"Short", 2:"Medium", 3:"Long", 0:"Not Specified"}
    df["FurLength"] = df["FurLength"].map(fur_dict)

    binary_dict = {1:"Yes", 2:"No", 3:"Not Sure"}
    df["Vaccinated"] = df["Vaccinated"].map(binary_dict)
    df["Dewormed"] = df["Dewormed"].map(binary_dict)
    df["Sterilized"] = df["Sterilized"].map(binary_dict)

    health_dict = {1:"Healthy", 2:"Minor Injury", 3:"Serious Injury", 0:"Not Specified"}
    df["Health"] = df["Health"].map(health_dict)

    state = pd.read_csv("StateLabels.csv")
    state_dict = dict(zip(state["StateID"], state["StateName"]))
    df["State"] = df["State"].map(state_dict)

    # More clean-up
    df = df.fillna('')
    df = df.where(pd.notnull(df), None)
    
    # Turn all columns into strings and then combine in new column as one string
    all_columns = list(df) # Creates list of all column headers
    df[all_columns] = df[all_columns].astype(str)
    
    # Combine all relevant columns as one string
    df["x_string"] = df[['Name', 
                         'Age', 
                         'Breed1', 
                         'Breed2', 
                         'Gender', 
                         'Color1', 
                         'Color2', 
                         'Color3', 
                         'MaturitySize', 
                         'FurLength', 
                         'Vaccinated', 
                         'Dewormed', 
                         'Sterilized', 
                         'Health', 
                         'Fee', 
                         'State', 
                         'Description']].agg(' '.join, axis=1)
    
    df = df[df.x_string.map(lambda x: x.isascii())]
    df_new = df[["x_string", "AdoptionSpeed"]]
    
    return df_new

In [3]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [4]:
df = pd.read_csv("train/train.csv", encoding='utf8')

In [5]:
df_processed = transform_string_merge(df)
df_processed.head()

Unnamed: 0,x_string,AdoptionSpeed
0,Nibble 3 Tabby Male Black White Small Short ...,2
1,No Name Yet 1 Domestic Medium Hair Male Black...,0
2,Brisco 1 Mixed Breed Male Brown White Medium...,3
3,Miko 4 Mixed Breed Female Black Brown Medium...,2
4,Hunter 1 Mixed Breed Male Black Medium Shor...,2


In [6]:
# Split processed df into train and split
train, test = train_test_split(df_processed, test_size=0.2, random_state=42, shuffle=True)
test.head()

Unnamed: 0,x_string,AdoptionSpeed
4632,Little Fud 3 Mixed Breed Female Brown Cream ...,1
11874,Hobbes 2 Domestic Short Hair Male Golden Whit...,1
1052,Cookie Girl 4 Mixed Breed Female Black Yellow...,2
9770,Sheena 8 Terrier Female Cream White Small Lo...,2
3773,72 Mixed Breed Mixed Brown Large Short Not...,4


In [7]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [8]:
def tokenize(data):
    tokenized_docs = [word_tokenize(doc) for doc in data]
    alpha_tokens = [[t.lower() for t in doc if t.isalpha() == True] for doc in tokenized_docs]
    lemmatizer = WordNetLemmatizer()
    lem_tokens = [[lemmatizer.lemmatize(alpha) for alpha in doc] for doc in alpha_tokens]
    X_stem_as_string = [" ".join(x_t) for x_t in lem_tokens]
    return X_stem_as_string

In [9]:
X_train_tk = tokenize(train["x_string"])
X_test_tk = tokenize(test["x_string"])

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

In [16]:
vct = CountVectorizer(stop_words='english', lowercase=False)
svd = TruncatedSVD(n_components=200, random_state=42)
tfvec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), lowercase=False)

preprocessing_pipe = Pipeline([
    ('vectorizer', tfvec),
    ('svd', svd)   
])

In [17]:
lsa_train = preprocessing_pipe.fit_transform(X_train_tk)
lsa_train.shape

(11438, 200)

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

In [19]:
rf = RandomForestClassifier()
mb = MultinomialNB()
pipe = Pipeline([
    ('vectorizer', tfvec),
    ('rf', mb)
])

In [20]:
pipe.fit(X_train_tk, train["AdoptionSpeed"])
y_pred = pipe.predict(X_test_tk)

In [21]:
print(classification_report(test["AdoptionSpeed"], y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        70
           1       0.46      0.04      0.07       583
           2       0.36      0.55      0.43       779
           3       0.62      0.07      0.13       645
           4       0.40      0.79      0.54       783

    accuracy                           0.39      2860
   macro avg       0.37      0.29      0.23      2860
weighted avg       0.44      0.39      0.31      2860



  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tk, train["AdoptionSpeed"])

predicted = clf.predict(test)

ValueError: Expected 2D array, got 1D array instead:
array=['pipi domestic short hair male gray large short yes yes yes healthy kuala lumpur dear all my name is pipi i year old i like to eat everyday having meal which is breakfast supper with dry biscuit and dinner with wet can food i like pink color cause i have a pink color cat i can lay on the floor play it by myself i do like to shower but every month i need to get a insect spray to keep insect away from me i never going step on the street because i am an house born cat i have an microchip at my neck so i will never getting lost i have been train do not claim over table always need to keep the claw so i will never make any one else getting hurt beside i a very clever cat i can understand mandarin too ever night i will going back to my cage to sleep but sometimes i need you to carry me back to the cage'
 'domestic short hair male black white medium short no yes no healthy selangor found it at cheras perdana please call kelly for more info'
 'american curl domestic long hair female cream medium long no yes no healthy selangor ready foe let go cute healthy kitten diet wetfood thnk you'
 ...
 'boo boo domestic medium hair female black gray white medium medium no yes not sure healthy selangor manja dan suka mengesel pandai berdikari pandai main bola dan pantang jumpa benda yang berbentuk bulat sure dia akan main macam messi'
 'german shepherd dog mixed breed female black large medium yes yes no healthy selangor cute puppy for adoption'
 'smokey domestic short hair siamese female brown cream white small short no yes yes healthy selangor loving cat love to sleep n eat'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [15]:
tokenizer = TweetTokenizer()
vectorizer = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
vectorizer.fit(train["x_string"])
X_train = vectorizer.transform(train["x_string"])

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=20)
rf.fit(X_train, train['AdoptionSpeed'])

predicted = rf.predict(test)



ValueError: could not convert string to float: 'Little Fud 3 Mixed Breed  Female Brown Cream  Medium Short Yes Yes No Healthy 0 Selangor 3 months old female puppy for adoption. She was rescue with her sister from the trunk toad to Kampung Desa Aman 2 weeks ago. Her sister has been adopted and we hope she too will also find a loving & forever home. Please pm me if interested. 1. Female Mix Bred 2. Dewormed 3. 1st vaccination done 4. Not spay yet (too young) 5. Area : Kepong 6. No fees just your promise to love & care for her till the end.'

In [None]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train["x_string"])
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train, train["AdoptionSpeed"])

predicted = clf.predict(test)