In [None]:
import pandas as pd

In [None]:
# Load the crawled dataset from disk and print its column / dtype / non-null summary.
data = pd.read_csv(filepath_or_buffer="crawled_data.csv")
data.info()

In [None]:
# --- FEATURE SELECTION ---
# Only the columns listed here are carried forward; every other column of
# `data` is left out of `cleaned_data`.
necessary_columns = [
    'ID',
    'Case Number',
    'Date',
    'Primary Type',
    'Description',
    'Location Description',
    'Arrest',
    'Domestic',
    'District',
    'Ward',
    'Community Area',
    'Year',
    'Month',
    'Latitude',
    'Longitude',
]
cleaned_data = data[necessary_columns]

In [None]:
# --- DROP UNNECESSARY VALUES IN PRIMARY TYPE ---

# Rows whose "Primary Type" is one of these values are removed.
typeToDrop = {
    'NON-CRIMINAL',
    'OTHER OFFENSE',
    'OTHER NARCOTIC VIOLATION',
    'OBSCENITY',
    'PUBLIC INDECENCY',
    'CONCEALED CARRY LICENSE VIOLATION',
    'LIQUOR LAW VIOLATION',
    'GAMBLING',
    'RITUALISM',
}
# .copy() makes the filtered result an independent frame. Later cells add
# columns to `cleaned_data`; doing that on a filtered slice triggers pandas'
# SettingWithCopyWarning (pandas < 3).
cleaned_data = cleaned_data[~cleaned_data['Primary Type'].isin(typeToDrop)].copy()
cleaned_data.shape

In [None]:
# --- ADD THE COUNT OF PRIMARY TYPE INTO THE DATAFRAME ---
# Every row receives the number of rows that share its "Primary Type".
primary_type_groups = cleaned_data.groupby("Primary Type")["Primary Type"]
cleaned_data["primary_type_count"] = primary_type_groups.transform("count")
cleaned_data["primary_type_count"]

In [None]:
# --- ADD THE TOTAL CRIMES PER YEAR ---

# Every row receives the number of rows that share its "Year".
year_groups = cleaned_data.groupby("Year")["Year"]
cleaned_data["total_crimes_per_yr"] = year_groups.transform("count")
cleaned_data["total_crimes_per_yr"]

In [None]:
# --- ADDING MONTHLY TOTAL ---

# Number of rows in each (Year, Month) pair.
monthly_counts = (
    cleaned_data.groupby(["Year", "Month"])
    .size()
    .reset_index(name="monthly_crime_count")
)

# Remove any "monthly_crime_count" left over from an earlier run of this cell
# before merging, so the merged column always keeps its plain name.
# pandas only appends "_x"/"_y" suffixes when a non-key column exists on both
# sides; on a fresh kernel there is no such overlap, so the previous lookup of
# "monthly_crime_count_y" raised KeyError and the cell only worked when re-run.
cleaned_data = cleaned_data.drop(columns=["monthly_crime_count"], errors="ignore").merge(
    monthly_counts, on=["Year", "Month"], how="left"
)
cleaned_data

In [None]:
# Function to generalize all the specific location description
def generalize_location(loc_desc):
  """Map a free-text location description to one broad location category.

  Matching is case-insensitive. Rules are tried in a fixed order and the
  first rule that matches decides the category.

  Args:
    loc_desc: The location description. Any value is accepted and converted
      with str(); missing values (None / NaN) are allowed.

  Returns:
    One of "Residential", "Public Transportation", "Commercial",
    "Street/Outdoor", "Institutional", "Airport", "Vehicle" or "Other".
    Missing values and descriptions that match no rule give "Other".
  """
  # Handle none values
  if pd.isna(loc_desc):
    return "Other"
  loc_desc = str(loc_desc).upper()

  # Phrases that contain a shorter keyword belonging to an earlier rule.
  # Checked first; otherwise they can never be reached:
  #   "WAREHOUSE" contains "HOUSE" (Residential),
  #   "GAS STATION" / "FIRE STATION" contain "STATION" (Public Transportation),
  #   "RIVERBANK" contains "BANK" (Commercial).
  shadowed_phrases = (
    ("WAREHOUSE", "Commercial"),
    ("GAS STATION", "Commercial"),
    ("FIRE STATION", "Institutional"),
    ("RIVERBANK", "Street/Outdoor"),
  )
  for phrase, category in shadowed_phrases:
    if phrase in loc_desc:
      return category

  # Whole words of the description (split on anything non-alphanumeric).
  # Used for "BUS" so that it does not fire inside words such as "BUSINESS".
  words = set("".join(ch if ch.isalnum() else " " for ch in loc_desc).split())

  # (category, substring keywords, whole-word keywords), in priority order.
  rules = (
    ("Residential",
     ("RESIDENCE", "APARTMENT", "HOUSE", "CHA APARTMENT", "RESIDENTIAL", "PORCH", "YARD", "GARAGE", "BASEMENT", "VESTIBULE", "HALLWAY"),
     ()),
    ("Public Transportation",
     ("CTA", "TRAIN", "SUBWAY", "PLATFORM", "STATION", "RAILROAD"),
     ("BUS",)),
    # NOTE(review): "BAR " keeps its trailing space exactly as before -
    # presumably to avoid matching inside longer words; confirm.
    ("Commercial",
     ("STORE", "RESTAURANT", "BAR ", "TAVERN", "HOTEL", "MOTEL", "BANK", "CURRENCY EXCHANGE", "GAS STATION", "OFFICE", "WAREHOUSE", "FACTORY", "LIQUOR", "BARBER", "BEAUTY SALON", "PAWN SHOP", "ATM", "CASINO"),
     ()),
    ("Street/Outdoor",
     ("STREET", "SIDEWALK", "ALLEY", "PARKING LOT", "VACANT LOT", "PARK PROPERTY", "LAKEFRONT", "RIVERBANK", "FOREST PRESERVE", "BRIDGE", "HIGHWAY", "GANGWAY", "DRIVEWAY"),
     ()),
    ("Institutional",
     ("SCHOOL", "HOSPITAL", "CHURCH", "SYNAGOGUE", "GOVERNMENT", "LIBRARY", "POLICE", "FIRE STATION", "NURSING HOME", "DAY CARE", "COLLEGE", "UNIVERSITY"),
     ()),
    ("Airport",
     ("AIRPORT", "AIRCRAFT"),
     ()),
    ("Vehicle",
     ("VEHICLE", "AUTO", "TAXICAB", "UBER", "LYFT", "TRUCK", "BOAT"),
     ()),
  )
  for category, substrings, whole_words in rules:
    if any(keyword in loc_desc for keyword in substrings):
      return category
    if any(keyword in words for keyword in whole_words):
      return category
  # Other
  return "Other"


# Derive the broad location category for every row from its raw description.
raw_location_descriptions = cleaned_data["Location Description"]
cleaned_data["generalized_loc"] = raw_location_descriptions.apply(generalize_location)
cleaned_data

In [None]:
# Optional clean-up, currently disabled: uncomment the next line to remove the
# raw "Location Description" text column from the frame.
# cleaned_data.drop(columns=["Location Description"], inplace=True)
cleaned_data

In [None]:
# --- TOTAL COUNT BY LONGITUDE AND LATITUDE ---

# Round both coordinates to 3 decimal places; rows with the same rounded pair
# are counted together.
cleaned_data["Lat_round"] = cleaned_data["Latitude"].round(3)
cleaned_data["Lon_round"] = cleaned_data["Longitude"].round(3)
location_counts = (
    cleaned_data.groupby(["Lat_round", "Lon_round"])
    .size()
    .reset_index(name="location_crime_count")
)
# Drop any "location_crime_count" left over from an earlier run of this cell.
# Otherwise the merge would see the name on both sides and emit
# "location_crime_count_x" / "location_crime_count_y" instead of the plain column.
cleaned_data = cleaned_data.drop(columns=["location_crime_count"], errors="ignore").merge(
    location_counts, on=["Lat_round", "Lon_round"], how="left"
)
cleaned_data

In [None]:
# Optional clean-up, currently disabled: drop the raw coordinate columns before saving.
# cleaned_data.drop(columns=["Latitude", "Longitude"], inplace=True)
# Write the processed frame to CSV; index=False keeps the row index out of the file.
cleaned_data.to_csv(path_or_buf="cleaned_crime_data.csv", index=False)
# cleaned_data