# Loading Data

In [22]:
import pandas as pd
import seaborn  as sb 
import matplotlib 
import csv
from scipy.stats import chi2_contingency

# read in the data from the directory /data
df = pd.read_csv("data/nyc_squirrels.csv", encoding="utf-8")
print(df)


           long        lat unique_squirrel_id hectare shift      date  \
0    -73.956134  40.794082     37F-PM-1014-03     37F    PM  10142018   
1    -73.957044  40.794851     37E-PM-1006-03     37E    PM  10062018   
2    -73.976831  40.766718      2E-AM-1010-03     02E    AM  10102018   
3    -73.975725  40.769703      5D-PM-1018-05     05D    PM  10182018   
4    -73.959313  40.797533     39B-AM-1018-01     39B    AM  10182018   
...         ...        ...                ...     ...   ...       ...   
3018 -73.963943  40.790868     30B-AM-1007-04     30B    AM  10072018   
3019 -73.970402  40.782560     19A-PM-1013-05     19A    PM  10132018   
3020 -73.966587  40.783678     22D-PM-1012-07     22D    PM  10122018   
3021 -73.963994  40.789915     29B-PM-1010-02     29B    PM  10102018   
3022 -73.975479  40.769640      5E-PM-1012-01     05E    PM  10122018   

      hectare_squirrel_number       age primary_fur_color  \
0                           3       NaN               NaN   
1

# Pre-Processing Dataset
- Reference 
-- https://realpython.com/python-data-cleaning-numpy-pandas/

In [21]:
new_df = df.dropna(axis='columns'); # dropping columns with null values 
print(df)

           long        lat unique_squirrel_id hectare shift      date  \
0    -73.956134  40.794082     37F-PM-1014-03     37F    PM  10142018   
1    -73.957044  40.794851     37E-PM-1006-03     37E    PM  10062018   
2    -73.976831  40.766718      2E-AM-1010-03     02E    AM  10102018   
3    -73.975725  40.769703      5D-PM-1018-05     05D    PM  10182018   
4    -73.959313  40.797533     39B-AM-1018-01     39B    AM  10182018   
...         ...        ...                ...     ...   ...       ...   
3018 -73.963943  40.790868     30B-AM-1007-04     30B    AM  10072018   
3019 -73.970402  40.782560     19A-PM-1013-05     19A    PM  10132018   
3020 -73.966587  40.783678     22D-PM-1012-07     22D    PM  10122018   
3021 -73.963994  40.789915     29B-PM-1010-02     29B    PM  10102018   
3022 -73.975479  40.769640      5E-PM-1012-01     05E    PM  10122018   

      hectare_squirrel_number       age primary_fur_color  \
0                           3       NaN               NaN   
1

# Hypothesis and Testing Methods 
#### markdown only 

Is there a significant difference in squirrel sightings between different hectare codes? (Chi-square test)

Is there a significant difference in the number of unique squirrels sighted between AM and PM shifts? (Two-sample t-test)

Is there a significant difference in the average number of hectare_squirrel_number per hectare across different community districts? (One-way ANOVA)

Is there an association between squirrel age groups (Adult, Juvenile, etc.) and their behaviors (approaches, indifferent, runs_from)? (Chi-square test)

Are the proportions of different primary and highlight fur colors observed in the dataset significantly different from each other? (Chi-square test)

Is there a relationship between the proportion of squirrels with different behaviors (approaches, indifferent, runs_from) and their fur colors? (Logistic regression)

Do different community districts, borough boundaries, city council districts, and police precincts have significantly different proportions of squirrel sightings? (Chi-square test)

Are there any spatial autocorrelation patterns in squirrel sightings based on their latitude and longitude coordinates? (Moran's I)

Is there a significant difference in the number of squirrel sightings between different dates of the year, possibly indicating seasonality effects? (Time series analysis or Kruskal-Wallis test)

In [33]:
# Create a contingency table
contingency_table = pd.crosstab(df['age'], df[['moans', 'foraging', 'chasing']].idxmax(axis=1))

# Perform Chi-square test
chi2, p_value, _, _ = chi2_contingency(contingency_table)

# Print results
print(f"Chi2 value: {chi2:.4f}")
print(f"P-value: {p_value:.4f}")

if p_value < 0.05:
    print("There is a significant association between squirrel age groups and their behaviors.")
else:
    print("There is no significant association between squirrel age groups and their behaviors.")

Chi2 value: 26.6123
P-value: 0.0000
There is a significant association between squirrel age groups and their behaviors.
