# Imports

In [None]:
import requests
import json
import pickle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression

# Where the notebook is executed from; selects the API endpoint below.
RUN_FROM = 'uni_wifi' #'bastion'

# Base URL and extra request headers for each supported location.
# NOTE(review): the 'HOST' header presumably lets the gateway at the raw IP
# route the request to the fission service — confirm.
_ENDPOINTS = {
    'bastion': ('http://fission:31001/', None),
    'uni_wifi': ('http://172.26.135.52:9090/', {'HOST': 'fission'}),
}

# Fail here with a clear message instead of a NameError on URL/HEADERS at the
# first API call further down the notebook.
if RUN_FROM not in _ENDPOINTS:
    raise ValueError(f'Unknown RUN_FROM {RUN_FROM!r}; expected one of {sorted(_ENDPOINTS)}')

URL, HEADERS = _ENDPOINTS[RUN_FROM]

# Functions

In [None]:
def weather_to_pd(station_id: str, start_year: int, end_year: int, verb=False) -> pd.DataFrame:
    """Fetch weather records from the API and return them as a DataFrame.

    Sends a GET request to ``URL + 'weather/<station_id>/<start_year>/<end_year>'``
    with the module-level ``HEADERS`` and builds the frame from the ``'Data'``
    entry of the JSON response.

    Args:
        station_id: Station identifier inserted into the request path.
        start_year: First path component of the year range.
        end_year: Second path component of the year range.
        verb: When true, print how many records were fetched.

    Returns:
        A DataFrame built with ``pd.DataFrame.from_records`` from the records
        under ``'Data'``.

    Raises:
        KeyError: If the decoded response has no ``'Data'`` key.
    """
    endpoint = URL + f'weather/{station_id}/{start_year}/{end_year}'
    response = requests.get(endpoint, headers=HEADERS)
    payload = json.loads(response.text)
    records = payload['Data']
    if verb:
        print(f'Called weather api, fetched {len(records)} lines')
    return pd.DataFrame.from_records(records)

In [None]:
def get_stream_to_pd(api: str, station_id: str, size: int, radius_km: int, verb=False) -> pd.DataFrame:
    """Download a paginated result set from the API into one DataFrame.

    The first page comes from ``URL + '<api>/<station_id>/<size>/<radius_km>'``.
    Each response carries ``'Status'``, ``'Token'`` and ``'Data'``; further
    pages are requested from ``URL + 'stream/<token>'`` for as long as the last
    response reported status 200 and returned a non-empty ``'Data'`` list.
    Only the ``'_source'`` entry of every returned item is kept.

    Args:
        api: Name of the API route for the first request (e.g. ``'crime'``).
        station_id: Station identifier inserted into the request path.
        size: Third path component of the first request.
            NOTE(review): presumably the page size — confirm against the API.
        radius_km: Fourth path component of the first request.
        verb: When true, print progress after every request.

    Returns:
        A DataFrame built from all collected ``'_source'`` records.
    """
    first_url = URL + api + f'/{station_id}/{size}/{radius_km}'
    page = json.loads(requests.get(first_url, headers=HEADERS).text)
    status, token, batch = page['Status'], page['Token'], page['Data']

    records = [item['_source'] for item in batch]
    if verb:
        print(f'Called {api} api, fetched {len(batch)} lines')

    stream_calls = 0
    # Keep following the token until a page fails or comes back empty.
    while status == 200 and batch != []:
        stream_calls += 1
        page = json.loads(requests.get(URL + 'stream/' + token, headers=HEADERS).text)
        status, token, batch = page['Status'], page['Token'], page['Data']
        if verb:
            print(f'Called stream {stream_calls} times, fetched {len(batch)} new lines')
        records.extend([item['_source'] for item in batch])

    if verb:
        print(f'Fetched a total of {len(records)}lines')
    return pd.DataFrame.from_records(records)

# Tests

## Joined data creation

In [None]:
# Raw weather records for station 95003 (start_year=2014, end_year=2019).
df_weather_full = weather_to_pd(
    station_id='95003',
    start_year=2014,
    end_year=2019,
    verb=True,
)

In [None]:
# Weather columns converted with pd.to_numeric below.
weather_num_cols = ['UV', 'Min Temp', 'Max Temp', 'WindSpeed', 'Min Humid', 'Max Humid', 'Rain', 'Pan-Rain', 'Evapo-Rain']

# Build the cleaned frame in one chain; df_weather_full itself is left untouched.
df_weather = (
    df_weather_full
    .assign(**{name: pd.to_numeric(df_weather_full[name]) for name in weather_num_cols})
    .rename(columns={'Date': 'date'})
    # Dates arrive as day/month/year strings; keep only the calendar date.
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%d/%m/%Y').dt.date)
    .drop(columns=['created_at', 'source', 'Station Name'])
)

df_weather.dtypes

In [None]:
# Raw crime records requested for station 95003 with radius_km=800, 8000 per request.
df_crime_full = get_stream_to_pd(
    api='crime',
    station_id='95003',
    size=8000,
    radius_km=800,
    verb=True,
)

In [None]:
# Preview the first three rows of the raw crime records.
df_crime_full.head(3)

In [None]:
# Sum offence_count per (reported_date, description_1) pair, then rename the
# date column and reduce it to a calendar date so it lines up with df_weather['date'].
df_crime = (
    df_crime_full
    .groupby(['reported_date', 'description_1'])['offence_count']
    .sum()
    .reset_index()
    .rename(columns={'reported_date': 'date'})
    .assign(date=lambda frame: pd.to_datetime(frame['date']).dt.date)
)

df_crime.dtypes

In [None]:
# Preview the first three rows of the aggregated crime frame.
df_crime.head(3)

In [None]:
# Preview the first three rows of the cleaned weather frame.
df_weather.head(3)

In [None]:
# Inner join on 'date': only dates present in both frames are kept.
df = df_weather.merge(df_crime, how='inner', on='date')
df.shape

In [None]:
# Preview the first three rows of the joined weather/crime frame.
df.head(3)

## Linear Regression

In [None]:
# List the distinct description_1 values present in the joined frame.
df['description_1'].unique()

### On persons

In [None]:
# Keep only the rows for offences against the person, then drop the columns
# that are not used as numeric inputs downstream.
is_person_offence = df['description_1'] == 'OFFENCES AGAINST THE PERSON'
df_pers = df.loc[is_person_offence].drop(columns=['description_1', 'date'])
df_pers.head(4)

In [None]:
# Regress offence_count on every other column of df_pers.
X_pers = df_pers.drop(columns='offence_count')
y_pers = df_pers['offence_count']

lin_model_pers = LinearRegression()
lin_model_pers.fit(X_pers.values, y_pers)

# Label each coefficient with the column it was actually fitted on. Slicing
# df_pers.columns[:-1] is only correct while offence_count happens to be the
# last column of df_pers.
pd.DataFrame({'Predictors': X_pers.columns, 'Coefficient': lin_model_pers.coef_})

In [None]:
# Single-predictor model: offence_count as a linear function of Evapo-Rain.
sub_lin_model_pers = LinearRegression()
sub_lin_model_pers.fit(df_pers[['Evapo-Rain']].values, df_pers['offence_count'])

# Two x values (min and max) are enough to draw the fitted straight line.
evapo_range = df_pers['Evapo-Rain'].agg(['min', 'max']).to_numpy().reshape(-1, 1)

fig, ax = plt.subplots()
ax.scatter(df_pers['Evapo-Rain'], df_pers['offence_count'], label='observations')
# Overlay the fitted model so the regression above is actually shown.
ax.plot(evapo_range.ravel(), sub_lin_model_pers.predict(evapo_range), color='red', label='linear fit')
ax.set(
    xlabel='Evapo-Rain',
    ylabel='offence_count',
    title='Offences against the person vs Evapo-Rain',
)
ax.legend()
plt.show()

In [None]:
# Pairwise Pearson correlations between all columns of df_pers.
corr_table_pers = df_pers.corr(method='pearson')

# Draw the matrix as an annotated heatmap on an explicit 8x6 axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr_table_pers,
    ax=ax,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Column tested against offence_count. Candidates:
# 'UV', 'Min Temp', 'Max Temp', 'WindSpeed', 'Min Humid', 'Max Humid', 'Rain', 'Pan-Rain', 'Evapo-Rain'
predictor = 'Evapo-Rain'

predictor_values = df_pers[predictor]
offence_counts = df_pers['offence_count']

# pearsonr returns the correlation coefficient together with its p-value.
r, p_value = pearsonr(predictor_values, offence_counts)
print(f'With a correlation of {r}, the p-value associated to H0:"There is no correlation" is {p_value}')

# Saving and sending lin_model_pers to the API

In [None]:
# NOTE(review): REGION is not defined anywhere in the visible notebook, so this
# cell raised NameError on a fresh kernel. Reuse an existing REGION if one was
# set, otherwise fall back to the station id queried above — confirm the
# intended value.
REGION = globals().get('REGION', '95003')

# Single source of truth for the file name, so the file that is uploaded is
# the one that was just written.
model_path = f'linear_model_pers_{REGION}.pkl'

with open(model_path, 'wb') as f:
    pickle.dump(lin_model_pers, f)

# Send the pickle file to the API via POST request.
# NOTE(review): this posts to the API root URL, as the original cell did —
# confirm the upload route.
with open(model_path, 'rb') as model_file:
    # The context manager closes the handle once the upload has finished.
    files = {'model': model_file}
    response = requests.post(URL, headers=HEADERS, files=files)

# Show the HTTP status of the upload as the cell output.
response.status_code