In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# Read in csv
df = pd.read_csv("merged_clean5.csv")

In [3]:
# Find shape
df.shape

(11237, 10)

#### Cleaning up:
1.  Creating a new column called "Mix".  Mix=1 if true, Mix=0 if false
2.  Took out the word "Mix" in Breed

In [4]:
# Make a copy and add new column called Mix, set to 0
new_df = df.copy()
new_df["Mix"]=0
new_df.head()

Unnamed: 0,CSV_Age,Day_mth_year_cat,Pet_ageMonths,Breed_primary,Breed_secondary,Color,Outcome_Type,Outcome_Rating,Fixed,Sex,Mix
0,2.0,months,2.0,Yorkshire Terrier,Rat Terrier,Tricolor,Adoption,Positive,Neutered,Male,0
1,2.0,months,2.0,Yorkshire Terrier,Rat Terrier,Tricolor,Adoption,Positive,Spayed,Female,0
2,2.0,months,2.0,Yorkshire Terrier,Rat Terrier,Tricolor,Adoption,Positive,Spayed,Female,0
3,2.0,years,24.0,Yorkshire Terrier,Pomeranian,Cream/Red,Adoption,Positive,Neutered,Male,0
4,3.0,years,36.0,Yorkshire Terrier,Parson Russell Terrier,Blue/Tan,Adoption,Positive,Neutered,Male,0


In [8]:
new_df.loc[200:300, :]

Unnamed: 0,CSV_Age,Day_mth_year_cat,Pet_ageMonths,Breed_primary,Breed_secondary,Color,Outcome_Type,Outcome_Rating,Fixed,Sex,Mix
200,2.0,years,24.0,Staffordshire,Blue Lacy,Blue/White,Adoption,Positive,Spayed,Female,0
201,1.0,year,12.0,Staffordshire Mix,,White/Tan,Adoption,Positive,Neutered,Male,0
202,1.0,year,12.0,Staffordshire Mix,,Brown Brindle,Adoption,Positive,Neutered,Male,0
203,1.0,year,12.0,Staffordshire Mix,,White/Brown,Adoption,Positive,Neutered,Male,0
204,1.0,year,12.0,Staffordshire Mix,,Brown Brindle/White,Adoption,Positive,Neutered,Male,0
...,...,...,...,...,...,...,...,...,...,...,...
296,10.0,months,10.0,Siberian Husky Mix,,White/Cream,Adoption,Positive,Neutered,Male,0
297,8.0,months,8.0,Siberian Husky Mix,,White,Adoption,Positive,Neutered,Male,0
298,10.0,months,10.0,Siberian Husky Mix,,White,Adoption,Positive,Neutered,Male,0
299,4.0,months,4.0,Siberian Husky Mix,,Gray/White,Adoption,Positive,Neutered,Male,0


In [9]:
# Find rows containing word "Mix" and set to 1 (for True)
new_df.loc[new_df["Breed_primary"].str.contains("Mix"), "Mix"] = 1

In [6]:
# Check
new_df.loc[200:300, :]

Unnamed: 0,CSV_Age,Day_mth_year_cat,Pet_ageMonths,Breed_primary,Breed_secondary,Color,Outcome_Type,Outcome_Rating,Fixed,Sex,Mix
200,2.0,years,24.0,Staffordshire,Blue Lacy,Blue/White,Adoption,Positive,Spayed,Female,0
201,1.0,year,12.0,Staffordshire Mix,,White/Tan,Adoption,Positive,Neutered,Male,0
202,1.0,year,12.0,Staffordshire Mix,,Brown Brindle,Adoption,Positive,Neutered,Male,0
203,1.0,year,12.0,Staffordshire Mix,,White/Brown,Adoption,Positive,Neutered,Male,0
204,1.0,year,12.0,Staffordshire Mix,,Brown Brindle/White,Adoption,Positive,Neutered,Male,0
...,...,...,...,...,...,...,...,...,...,...,...
296,10.0,months,10.0,Siberian Husky Mix,,White/Cream,Adoption,Positive,Neutered,Male,0
297,8.0,months,8.0,Siberian Husky Mix,,White,Adoption,Positive,Neutered,Male,0
298,10.0,months,10.0,Siberian Husky Mix,,White,Adoption,Positive,Neutered,Male,0
299,4.0,months,4.0,Siberian Husky Mix,,Gray/White,Adoption,Positive,Neutered,Male,0


In [10]:
# Rename the breed WITHOUT the word MIX
new_df.loc[new_df['Breed_primary'].str.contains('Mix'), 'Breed_primary'] = new_df.loc[df['Breed_primary'].str.contains('Mix'), 'Breed_primary'].str.replace(' Mix', '')

In [11]:
# Verify that the word 'Mix' is no longer part of the description if Mix is equal to 1
new_df.loc[200:300, :]

Unnamed: 0,CSV_Age,Day_mth_year_cat,Pet_ageMonths,Breed_primary,Breed_secondary,Color,Outcome_Type,Outcome_Rating,Fixed,Sex,Mix
200,2.0,years,24.0,Staffordshire,Blue Lacy,Blue/White,Adoption,Positive,Spayed,Female,0
201,1.0,year,12.0,Staffordshire,,White/Tan,Adoption,Positive,Neutered,Male,1
202,1.0,year,12.0,Staffordshire,,Brown Brindle,Adoption,Positive,Neutered,Male,1
203,1.0,year,12.0,Staffordshire,,White/Brown,Adoption,Positive,Neutered,Male,1
204,1.0,year,12.0,Staffordshire,,Brown Brindle/White,Adoption,Positive,Neutered,Male,1
...,...,...,...,...,...,...,...,...,...,...,...
296,10.0,months,10.0,Siberian Husky,,White/Cream,Adoption,Positive,Neutered,Male,1
297,8.0,months,8.0,Siberian Husky,,White,Adoption,Positive,Neutered,Male,1
298,10.0,months,10.0,Siberian Husky,,White,Adoption,Positive,Neutered,Male,1
299,4.0,months,4.0,Siberian Husky,,Gray/White,Adoption,Positive,Neutered,Male,1


In [12]:
new_df.shape

(11237, 11)

#### Dropping null values in Outcome_Type Column

In [15]:
# Drop rows where Outcome_Rating is null

new_df2 = new_df.loc[df.Outcome_Rating.notna(), ["Outcome_Rating", "Pet_ageMonths", "Breed_primary", "Breed_secondary" ,"Color", "Sex", "Fixed", "Mix"]]

In [16]:
new_df2.loc[200:300, ]

Unnamed: 0,Outcome_Rating,Pet_ageMonths,Breed_primary,Breed_secondary,Color,Sex,Fixed,Mix
200,Positive,24.0,Staffordshire,Blue Lacy,Blue/White,Female,Spayed,0
201,Positive,12.0,Staffordshire,,White/Tan,Male,Neutered,1
202,Positive,12.0,Staffordshire,,Brown Brindle,Male,Neutered,1
203,Positive,12.0,Staffordshire,,White/Brown,Male,Neutered,1
204,Positive,12.0,Staffordshire,,Brown Brindle/White,Male,Neutered,1
...,...,...,...,...,...,...,...,...
296,Positive,10.0,Siberian Husky,,White/Cream,Male,Neutered,1
297,Positive,8.0,Siberian Husky,,White,Male,Neutered,1
298,Positive,10.0,Siberian Husky,,White,Male,Neutered,1
299,Positive,4.0,Siberian Husky,,Gray/White,Male,Neutered,1


In [17]:
# Verify shape
new_df2.shape

(11237, 8)

In [158]:
# new_df2.groupby('Breed').count().sort_values("Mix", ascending=True)

Unnamed: 0_level_0,Outcome_Type,Pet_ageMonths,Color,Sex,Fixed,Mix
Breed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
French Bulldog/Miniature Schnauzer,1,1,1,1,1,1
Great Dane/German Shepherd,1,1,1,1,1,1
Great Dane/Pit Bull,1,1,1,1,1,1
Great Dane/Staffordshire,1,1,1,1,1,1
Great Pyrenees/Australian Shepherd,1,1,1,1,1,1
...,...,...,...,...,...,...
Australian Cattle Dog,392,392,392,392,392,392
German Shepherd,650,650,650,650,650,650
Labrador Retriever,1414,1414,1414,1414,1414,1414
Chihuahua Shorthair,1840,1840,1840,1840,1840,1840


In [162]:
# new_df2.groupby('Breed').filter(lambda x: len(x) > 10)

Unnamed: 0,Outcome_Type,Pet_ageMonths,Breed,Color,Sex,Fixed,Mix
2,Return_to_owner,0.03,Pit Bull,White/Black,Male,Intact,1
3,Return_to_owner,0.03,Pit Bull,White/Brown Brindle,Male,Intact,1
4,Return_to_owner,0.03,Pit Bull,Brown Brindle/White,Female,Intact,1
5,Return_to_owner,0.03,Pit Bull,White/Brown Brindle,Male,Intact,1
6,Return_to_owner,0.03,Pit Bull,White/Brown,Female,Intact,1
...,...,...,...,...,...,...,...
22133,Return_to_owner,48.00,Labrador Retriever,Black,Female,Spayed,1
22134,Return_to_owner,60.00,Pit Bull,White,Female,Spayed,1
22135,Adoption,24.00,Carolina Dog,Red,Female,Spayed,1
22136,Euthanasia,108.00,German Shepherd/Labrador Retriever,Black/Brown,Female,Spayed,0


#### get_dummies - Changing to numeric values

In [18]:
# Passing get_dummies on all columns except Outcome_rating and Pet_ageMonths
new_df3 = pd.get_dummies(data=new_df2, columns=['Breed_primary', 'Breed_secondary', 'Color', 'Sex', 'Fixed', 'Mix'])


In [19]:
new_df3.head()

Unnamed: 0,Outcome_Rating,Pet_ageMonths,Breed_primary_Affenpinscher,Breed_primary_Airedale Terrier,Breed_primary_Akita,Breed_primary_Alaskan Husky,Breed_primary_Alaskan Malamute,Breed_primary_American Bulldog,Breed_primary_American Eskimo,Breed_primary_American Foxhound,...,Color_Yellow Brindle/White,Color_Yellow/Black,Color_Yellow/White,Sex_Female,Sex_Male,Fixed_Intact,Fixed_Neutered,Fixed_Spayed,Mix_0,Mix_1
0,Positive,2.0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
1,Positive,2.0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
2,Positive,2.0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
3,Positive,24.0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
4,Positive,36.0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0


In [20]:
new_df3.shape

(11237, 551)

In [21]:
# Drop these columns as they are duplicates
new_df3 = new_df3.drop(columns=['Sex_Female', 'Mix_0'])


In [22]:
# Verify that 2 columns dropped
new_df3.shape

(11237, 549)

In [23]:
# Define X and y
X = new_df3.drop(['Outcome_Rating'], axis="columns")
y = new_df3[['Outcome_Rating']]

In [24]:
X.shape

(11237, 548)

In [25]:
y.shape

(11237, 1)

### random forest

In [26]:
# Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [27]:
# Import, initialize, fit and predict
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators = 100, random_state = 10) 
rf_model.fit(X_train, y_train)
predict_y_test = rf_model.predict(X_test)

  after removing the cwd from sys.path.


In [28]:
# Validate - run accuracy score
from sklearn import metrics
print("Accuracy score: ", metrics.accuracy_score(y_test, predict_y_test)) # actual test vs new predict test

Accuracy score:  0.7320284697508896


In [29]:
# Validate with cross validation

from sklearn.model_selection import cross_val_score

scores = cross_val_score(rf_model, X, y, cv=10, scoring="accuracy")
print(scores)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[0.40658363 0.29270463 0.3113879  0.35053381 0.31227758 0.2366548
 0.27224199 0.32324132 0.32947462 0.4826358 ]


In [30]:
scores.mean()

0.33177360780573134


### Logistic Regression

In [31]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [32]:
from sklearn.linear_model import LogisticRegression
logReg = LogisticRegression()

In [33]:
logReg.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [34]:
print(f"Training Data Score: {logReg.score(X_train, y_train)}")
print(f"Testing Data Score: {logReg.score(X_test, y_test)}")

Training Data Score: 0.7745342351963925
Testing Data Score: 0.7654804270462633


In [None]:
predictions = logReg.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10]}")

In [None]:
 pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)