In [137]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [138]:
# Read in csv
df = pd.read_csv("merged_clean3.csv")

In [139]:
# Find shape
df.shape

(22139, 8)

#### Cleaning up:
1.  Creating a new column called "Mix".  Mix=1 if true, Mix=0 if false
2.  Took out the word "Mix" in Breed

In [140]:
# Make a copy and add new column called Mix, set to 0
new_df = df.copy()
new_df["Mix"]=0
new_df.head()

Unnamed: 0,CSV_Age,Day_mth_year_cat,Pet_ageMonths,Breed,Color,Outcome_Type,Fixed,Sex,Mix
0,1.0,day,0.03,Chihuahua Shorthair Mix,Tricolor,,Intact,Male,0
1,1.0,day,0.03,Pit Bull Mix,White,,Intact,Female,0
2,1.0,day,0.03,Pit Bull Mix,White/Black,Return_to_owner,Intact,Male,0
3,1.0,day,0.03,Pit Bull Mix,White/Brown Brindle,Return_to_owner,Intact,Male,0
4,1.0,day,0.03,Pit Bull Mix,Brown Brindle/White,Return_to_owner,Intact,Female,0


In [141]:
# Find rows containing word "Mix" and set to 1 (for True)
new_df.loc[new_df["Breed"].str.contains("Mix"), "Mix"] = 1

In [142]:
# Check
new_df.loc[200:300, :]

Unnamed: 0,CSV_Age,Day_mth_year_cat,Pet_ageMonths,Breed,Color,Outcome_Type,Fixed,Sex,Mix
200,1.0,month,1.0,Plott Hound Mix,Brown/Black,Transfer,Intact,Male,1
201,1.0,month,1.0,Plott Hound Mix,Red,Transfer,Intact,Female,1
202,1.0,month,1.0,Chihuahua Shorthair Mix,Black/Brown,Euthanasia,Intact,Male,1
203,1.0,month,1.0,Chihuahua Shorthair Mix,Tricolor,Transfer,Intact,Female,1
204,1.0,month,1.0,Dachshund Mix,Brown/White,Transfer,Intact,Female,1
...,...,...,...,...,...,...,...,...,...
296,1.0,month,1.0,Whippet/Dachshund,White,Transfer,Intact,Female,0
297,1.0,month,1.0,Toy Poodle Mix,Blue/White,Transfer,Intact,Male,1
298,1.0,month,1.0,Australian Shepherd Mix,Tricolor,Adoption,Intact,Male,1
299,1.0,month,1.0,Pit Bull Mix,Black,Transfer,Intact,Male,1


In [143]:
# Rename the breed WITHOUT the word MIX
new_df.loc[new_df['Breed'].str.contains('Mix'), 'Breed'] = new_df.loc[df['Breed'].str.contains('Mix'), 'Breed'].str.replace(' Mix', '')

In [144]:
# Verify that the word 'Mix' is no longer part of the description if Mix is equal to 1
new_df.loc[200:300, :]

Unnamed: 0,CSV_Age,Day_mth_year_cat,Pet_ageMonths,Breed,Color,Outcome_Type,Fixed,Sex,Mix
200,1.0,month,1.0,Plott Hound,Brown/Black,Transfer,Intact,Male,1
201,1.0,month,1.0,Plott Hound,Red,Transfer,Intact,Female,1
202,1.0,month,1.0,Chihuahua Shorthair,Black/Brown,Euthanasia,Intact,Male,1
203,1.0,month,1.0,Chihuahua Shorthair,Tricolor,Transfer,Intact,Female,1
204,1.0,month,1.0,Dachshund,Brown/White,Transfer,Intact,Female,1
...,...,...,...,...,...,...,...,...,...
296,1.0,month,1.0,Whippet/Dachshund,White,Transfer,Intact,Female,0
297,1.0,month,1.0,Toy Poodle,Blue/White,Transfer,Intact,Male,1
298,1.0,month,1.0,Australian Shepherd,Tricolor,Adoption,Intact,Male,1
299,1.0,month,1.0,Pit Bull,Black,Transfer,Intact,Male,1


In [145]:
new_df.shape

(22139, 9)

#### Dropping null values in Outcome_Type Column

In [146]:
# Drop rows where Outcome_Type is null

new_df2 = new_df.loc[df.Outcome_Type.notna(), ["Outcome_Type", "Pet_ageMonths", "Breed", "Color", "Sex", "Fixed", "Mix"]]

In [151]:
new_df2.loc[200:300, ]

Unnamed: 0,Outcome_Type,Pet_ageMonths,Breed,Color,Sex,Fixed,Mix
200,Transfer,1.0,Plott Hound,Brown/Black,Male,Intact,1
201,Transfer,1.0,Plott Hound,Red,Female,Intact,1
202,Euthanasia,1.0,Chihuahua Shorthair,Black/Brown,Male,Intact,1
203,Transfer,1.0,Chihuahua Shorthair,Tricolor,Female,Intact,1
204,Transfer,1.0,Dachshund,Brown/White,Female,Intact,1
...,...,...,...,...,...,...,...
296,Transfer,1.0,Whippet/Dachshund,White,Female,Intact,0
297,Transfer,1.0,Toy Poodle,Blue/White,Male,Intact,1
298,Adoption,1.0,Australian Shepherd,Tricolor,Male,Intact,1
299,Transfer,1.0,Pit Bull,Black,Male,Intact,1


In [163]:
# Verify shape
new_df2.shape

(15517, 7)

In [158]:
# new_df2.groupby('Breed').count().sort_values("Mix", ascending=True)

Unnamed: 0_level_0,Outcome_Type,Pet_ageMonths,Color,Sex,Fixed,Mix
Breed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
French Bulldog/Miniature Schnauzer,1,1,1,1,1,1
Great Dane/German Shepherd,1,1,1,1,1,1
Great Dane/Pit Bull,1,1,1,1,1,1
Great Dane/Staffordshire,1,1,1,1,1,1
Great Pyrenees/Australian Shepherd,1,1,1,1,1,1
...,...,...,...,...,...,...
Australian Cattle Dog,392,392,392,392,392,392
German Shepherd,650,650,650,650,650,650
Labrador Retriever,1414,1414,1414,1414,1414,1414
Chihuahua Shorthair,1840,1840,1840,1840,1840,1840


In [162]:
# new_df2.groupby('Breed').filter(lambda x: len(x) > 10)

Unnamed: 0,Outcome_Type,Pet_ageMonths,Breed,Color,Sex,Fixed,Mix
2,Return_to_owner,0.03,Pit Bull,White/Black,Male,Intact,1
3,Return_to_owner,0.03,Pit Bull,White/Brown Brindle,Male,Intact,1
4,Return_to_owner,0.03,Pit Bull,Brown Brindle/White,Female,Intact,1
5,Return_to_owner,0.03,Pit Bull,White/Brown Brindle,Male,Intact,1
6,Return_to_owner,0.03,Pit Bull,White/Brown,Female,Intact,1
...,...,...,...,...,...,...,...
22133,Return_to_owner,48.00,Labrador Retriever,Black,Female,Spayed,1
22134,Return_to_owner,60.00,Pit Bull,White,Female,Spayed,1
22135,Adoption,24.00,Carolina Dog,Red,Female,Spayed,1
22136,Euthanasia,108.00,German Shepherd/Labrador Retriever,Black/Brown,Female,Spayed,0


#### get_dummies - Changing to numeric values

In [168]:
# Passing get_dummies on all columns except Outcome_Type and Pet_ageMonths
new_df3 = pd.get_dummies(data=new_df2, columns=['Breed', 'Color', 'Sex', 'Fixed', 'Mix'])


In [169]:
new_df3.head()

Unnamed: 0,Outcome_Type,Pet_ageMonths,Breed_Affenpinscher,Breed_Afghan Hound,Breed_Airedale Terrier,Breed_Airedale Terrier/Labrador Retriever,Breed_Airedale Terrier/Miniature Schnauzer,Breed_Akita,Breed_Akita/Australian Cattle Dog,Breed_Akita/Chow Chow,...,Color_Yellow/Black,Color_Yellow/White,Color_Yellow/Yellow,Sex_Female,Sex_Male,Fixed_Intact,Fixed_Neutered,Fixed_Spayed,Mix_0,Mix_1
2,Return_to_owner,0.03,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,1
3,Return_to_owner,0.03,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,1
4,Return_to_owner,0.03,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,1
5,Return_to_owner,0.03,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,1
6,Return_to_owner,0.03,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,1


In [170]:
new_df3.shape

(15517, 1468)

In [172]:
# Drop these columns as they are duplicates
new_df3 = new_df3.drop(columns=['Sex_Female', 'Mix_0'])


In [174]:
# Verify that 2 columns dropped
new_df3.shape

(15517, 1466)

In [177]:
# Define X and y
X = new_df3.drop(['Outcome_Type'], axis="columns")
y = new_df3[['Outcome_Type']]

In [178]:
X.shape

(15517, 1465)

In [179]:
y.shape

(15517, 1)

### random forest

In [183]:
# Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [184]:
# Import, initialize, fit and predict
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators = 100, random_state = 10) 
rf_model.fit(X_train, y_train)
predict_y_test = rf_model.predict(X_test)

  after removing the cwd from sys.path.


In [182]:
# Validate - run accuracy score
from sklearn import metrics
print("Accuracy score: ", metrics.accuracy_score(y_test, predict_y_test)) # actual test vs new predict test

Accuracy score:  0.5162371134020619


In [185]:
# Validate with cross validation

from sklearn.model_selection import cross_val_score

scores = cross_val_score(rf_model, X, y, cv=10, scoring="accuracy")
print(scores)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[0.53350515 0.56701031 0.25902062 0.35889175 0.34407216 0.45167526
 0.43556701 0.14700193 0.379755   0.38426821]


In [186]:
scores.mean()

0.38607674131089353


### Logistic Regression

In [187]:
 from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [188]:
from sklearn.linear_model import LogisticRegression
logReg = LogisticRegression()

In [189]:
logReg.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [190]:
print(f"Training Data Score: {logReg.score(X_train, y_train)}")
print(f"Testing Data Score: {logReg.score(X_test, y_test)}")

Training Data Score: 0.5573601443671049
Testing Data Score: 0.5494845360824743


In [None]:
predictions = logReg.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10]}")

In [None]:
 pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)