In [223]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [205]:
def transform_string_merge(df):
    # First transform all the relevant numerical columns to strings
    df["Type"] = df["Type"].apply(lambda x: "Dog" if x == 1 else "Cat")
    
    breed = pd.read_csv("BreedLabels.csv")
    breed_dict = dict(zip(breed["BreedID"], breed["BreedName"]))
    df["Breed1"] = df["Breed1"].map(breed_dict)
    df["Breed2"] = df["Breed2"].map(breed_dict)
    
    gender_dict = {1:"Male", 2:"Female", 3:"Mixed"}
    df["Gender"] = df["Gender"].map(gender_dict)
    
    color = pd.read_csv("ColorLabels.csv")
    color_dict = dict(zip(color["ColorID"], color["ColorName"]))
    df["Color1"] = df["Color1"].map(color_dict)
    df["Color2"] = df["Color2"].map(color_dict)
    df["Color3"] = df["Color3"].map(color_dict)
    
    maturity_dict = {1:"Small", 2:"Medium", 3:"Large", 4:"Extra Large", 0:"Not Specified"}
    df["MaturitySize"] = df["MaturitySize"].map(maturity_dict)
    
    fur_dict = {1:"Short", 2:"Medium", 3:"Long", 0:"Not Specified"}
    df["FurLength"] = df["FurLength"].map(fur_dict)

    binary_dict = {1:"Yes", 2:"No", 3:"Not Sure"}
    df["Vaccinated"] = df["Vaccinated"].map(binary_dict)
    df["Dewormed"] = df["Dewormed"].map(binary_dict)
    df["Sterilized"] = df["Sterilized"].map(binary_dict)

    health_dict = {1:"Healthy", 2:"Minor Injury", 3:"Serious Injury", 0:"Not Specified"}
    df["Health"] = df["Health"].map(health_dict)

    state = pd.read_csv("StateLabels.csv")
    state_dict = dict(zip(state["StateID"], state["StateName"]))
    df["State"] = df["State"].map(state_dict)

    # Turn all columns into strings and then combine in new column as one string
    all_columns = list(df) # Creates list of all column headers
    df[all_columns] = df[all_columns].astype(str)
    # Combine all relevant columns as one string
    df["x_string"] = df[['Name', 
                         'Age', 
                         'Breed1', 
                         'Breed2', 
                         'Gender', 
                         'Color1', 
                         'Color2', 
                         'Color3', 
                         'MaturitySize', 
                         'FurLength', 
                         'Vaccinated', 
                         'Dewormed', 
                         'Sterilized', 
                         'Health', 
                         'Fee', 
                         'State', 
                         'Description']].agg(' '.join, axis=1)

    df_new = df[["x_string", "AdoptionSpeed"]]
    
    return df_new

In [206]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [211]:
train = pd.read_csv("train/train.csv")
test = pd.read_csv("test/test.csv")

In [212]:
train_processed = transform_string_merge(train)
train_processed.head()

Unnamed: 0,x_string,AdoptionSpeed
0,Nibble 3 Tabby nan Male Black White nan Small ...,2
1,No Name Yet 1 Domestic Medium Hair nan Male Bl...,0
2,Brisco 1 Mixed Breed nan Male Brown White nan ...,3
3,Miko 4 Mixed Breed nan Female Black Brown nan ...,2
4,Hunter 1 Mixed Breed nan Male Black nan nan Me...,2


In [222]:
# Further cleanup and removing non-english rows (Chinsese and Malay) -> Could be added to function
train_processed = train_processed.fillna('')
train_processed = train_processed[train_processed.x_string.map(lambda x: x.isascii())]
train.info()
train_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14993 entries, 0 to 14992
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Type           14993 non-null  object
 1   Name           14993 non-null  object
 2   Age            14993 non-null  object
 3   Breed1         14993 non-null  object
 4   Breed2         14993 non-null  object
 5   Gender         14993 non-null  object
 6   Color1         14993 non-null  object
 7   Color2         14993 non-null  object
 8   Color3         14993 non-null  object
 9   MaturitySize   14993 non-null  object
 10  FurLength      14993 non-null  object
 11  Vaccinated     14993 non-null  object
 12  Dewormed       14993 non-null  object
 13  Sterilized     14993 non-null  object
 14  Health         14993 non-null  object
 15  Quantity       14993 non-null  object
 16  Fee            14993 non-null  object
 17  State          14993 non-null  object
 18  RescuerID      14993 non-n

In [215]:
# Feature extraction and clean-up using bag-of-words model
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_processed["x_string"])
X_train_counts.shape

(14993, 23563)