In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas_gbq import read_gbq
from sklearn.preprocessing import LabelEncoder
from statsmodels.tsa.stattools import adfuller
from google.cloud import storage

In [2]:
# Google Cloud project the BigQuery job is run under.
project_id = "wagon-bootcamp-428814"

# Count rows per (`Date`, `Community Area`) pair for community areas '40' and '43'.
# `Date` carries a time component, so each group is one exact timestamp,
# not one calendar day.
query = """
SELECT 
    `Date`,
    `Community Area`,
    COUNT(*) AS crime_count
FROM 
    `wagon-bootcamp-428814.chicago_crime.chicago_crime_tab`
WHERE 
    `Community Area` IN ('40', '43')
GROUP BY 
    `Date`, `Community Area`
ORDER BY 
    `Date`, `Community Area`
"""

# Call pandas_gbq.read_gbq (imported at the top of the notebook) directly:
# the `pd.read_gbq` wrapper is deprecated in pandas and emits a FutureWarning.
df = read_gbq(query, project_id=project_id, dialect='standard')

df.head()


  df = pd.read_gbq(query, project_id=project_id, dialect='standard')


Unnamed: 0,Date,Community Area,crime_count
0,2001-01-01 00:00:00,40,1
1,2001-01-01 00:00:00,43,3
2,2001-01-01 00:01:00,40,1
3,2001-01-01 00:01:00,43,1
4,2001-01-01 01:00:00,40,1


In [3]:
# Column dtypes, non-null counts and memory usage of the query result.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299033 entries, 0 to 299032
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   Date            299033 non-null  datetime64[us]
 1   Community Area  299033 non-null  object        
 2   crime_count     299033 non-null  Int64         
dtypes: Int64(1), datetime64[us](1), object(1)
memory usage: 7.1+ MB


In [4]:
# Summary statistics for the datetime and numeric columns.
df.describe()

Unnamed: 0,Date,crime_count
count,299033,299033.0
mean,2011-11-25 13:23:23.237033,1.093592
min,2001-01-01 00:00:00,1.0
25%,2006-06-05 21:00:00,1.0
50%,2010-12-24 09:45:00,1.0
75%,2017-02-16 09:00:00,1.0
max,2024-08-12 00:00:00,19.0
std,,0.358469


In [5]:
# Column labels of the loaded frame.
df.columns

Index(['Date', 'Community Area', 'crime_count'], dtype='object')

In [6]:
# (number of rows, number of columns).
df.shape

(299033, 3)

In [7]:
# Number of missing values in each column.
df.isnull().sum()

Date              0
Community Area    0
crime_count       0
dtype: int64

In [8]:
# Rows per community area. Each row is one (timestamp, area) group from the
# query, so these are group counts, not the sum of `crime_count`.
df['Community Area'].value_counts()

Community Area
43    222660
40     76373
Name: count, dtype: int64

In [9]:
# Encoder that is fitted on the 'Community Area' labels in the next cell.
le_community_area = LabelEncoder()

In [10]:
# Fit the encoder on the string labels and store the resulting integer codes
# in a new column; the original 'Community Area' column is left in place here.
df['Community Area Encoded'] = le_community_area.fit_transform(df['Community Area'])
# Display the frame to check the new column.
df

Unnamed: 0,Date,Community Area,crime_count,Community Area Encoded
0,2001-01-01 00:00:00,40,1,0
1,2001-01-01 00:00:00,43,3,1
2,2001-01-01 00:01:00,40,1,0
3,2001-01-01 00:01:00,43,1,1
4,2001-01-01 01:00:00,40,1,0
...,...,...,...,...
299028,2024-08-11 21:30:00,40,1,0
299029,2024-08-11 22:00:00,43,1,1
299030,2024-08-11 22:14:00,40,1,0
299031,2024-08-11 23:30:00,43,1,1


In [12]:
# Render each timestamp as a 'YYYY-MM-DD' string, discarding the time of day.
# NOTE(review): rows are still one per original timestamp, so the same
# (day, area) pair appears many times and `crime_count` is not a daily total.
# Confirm whether a groupby/sum per day is intended before modelling.
df['Formatted Date'] = df['Date'].dt.strftime('%Y-%m-%d')

In [13]:
# Remove the raw timestamp and the string label now that 'Formatted Date' and
# 'Community Area Encoded' have been derived from them.
# errors='ignore' keeps the cell idempotent: re-running it after the columns
# are already gone would otherwise raise KeyError.
df = df.drop(columns=['Date', 'Community Area'], errors='ignore')

In [14]:
df.head()

Unnamed: 0,crime_count,Community Area Encoded,Formatted Date
0,1,0,2001-01-01
1,3,1,2001-01-01
2,1,0,2001-01-01
3,1,1,2001-01-01
4,1,0,2001-01-01


In [None]:
# df.to_csv('cleaned_data.csv', index=False)

In [None]:
# Write the cleaned frame to disk first. Without this step the upload below
# depends on a 'cleaned_data.csv' left over from an earlier session and fails
# on a fresh Restart & Run All.
local_csv_path = 'cleaned_data.csv'
df.to_csv(local_csv_path, index=False)

# Initialize a client
client = storage.Client()

# Specify the bucket name
bucket_name = 'your-bucket-name'  # placeholder: replace with a real bucket before running
bucket = client.get_bucket(bucket_name)

# Specify the destination blob name (the file name in the cloud)
blob_name = 'path/to/directory/cleaned_data.csv'  # Change the path as needed
blob = bucket.blob(blob_name)

# Upload the file that was just written
blob.upload_from_filename(local_csv_path)

print("File uploaded to Google Cloud Storage successfully.")

In [None]:
# Print the name of each object returned by the bucket listing, one per line.
blobs = bucket.list_blobs()

for listed_blob in blobs:
    print(listed_blob.name)