In [36]:
import pandas as pd
import numpy as np

from utils import only_2016_data, clean_out_nan_heavy_rows, group_ages
from typing import List, Tuple

spatial = pd.read_csv('data/grunnkrets_norway_stripped.csv')
age = pd.read_csv('data/grunnkrets_age_distribution.csv')
income = pd.read_csv('data/grunnkrets_income_households.csv').set_index(['grunnkrets_id', 'year']).add_prefix('income_').reset_index()
households = pd.read_csv('data/grunnkrets_households_num_persons.csv')
submission = pd.read_csv('data/sample_submission.csv')
plaace = pd.read_csv('data/plaace_hierarchy.csv')
busstops = pd.read_csv('data/busstops_norway.csv')

train = pd.read_csv('data/stores_train.csv') #.drop(columns=['year'])
test = pd.read_csv('data/stores_test.csv') 

In [None]:
age_ranges = [
    (0, 19),
    (20, 39),
    (40, 59),
    (60, 79),
    (80, 90),
]


age_new = age[['grunnkrets_id', 'year']].drop_duplicates(subset=['grunnkrets_id'], keep='last')
for rng in age_ranges:
    cols = [f'age_{age}' for age in range(rng[0], rng[1] + 1)]
    rng_sum = age[cols].sum(axis=1).astype(int)
    age_new[f'age_{rng[0]}_{rng[-1]}'] = rng_sum

age = age.drop_duplicates(subset='grunnkrets_id').drop(columns=['year', *(f'age_{age}' for age in range(0, 91))], axis=1)
age = age.merge(age_new.drop(columns=['year']), on='grunnkrets_id')

In [37]:
train = train.merge(age, on='grunnkrets_id', how='left')
train.isna().sum()

store_id                   0
year                       0
store_name                 0
plaace_hierarchy_id        0
sales_channel_name         0
grunnkrets_id              0
address                 1774
lat                        0
lon                        0
chain_name              9122
mall_name              10579
revenue                    0
age_0_19                 805
age_20_39                805
age_40_59                805
age_60_79                805
age_80_90                805
dtype: int64

In [38]:
train = train.merge(households.drop(columns=['year']), on='grunnkrets_id', how='left')
train.head()
list(train)

['store_id',
 'year',
 'store_name',
 'plaace_hierarchy_id',
 'sales_channel_name',
 'grunnkrets_id',
 'address',
 'lat',
 'lon',
 'chain_name',
 'mall_name',
 'revenue',
 'age_0_19',
 'age_20_39',
 'age_40_59',
 'age_60_79',
 'age_80_90',
 'couple_children_0_to_5_years',
 'couple_children_18_or_above',
 'couple_children_6_to_17_years',
 'couple_without_children',
 'single_parent_children_0_to_5_years',
 'single_parent_children_18_or_above',
 'single_parent_children_6_to_17_years',
 'singles']

In [46]:
np.mean(train[age_ranges], axis=0)

KeyError: "None of [Index([(0, 19), (20, 39), (40, 59), (60, 79), (80, 90)], dtype='object')] are in the [columns]"

In [40]:
spatial_2016 = only_2016_data(spatial)
income_2016 = only_2016_data(income)
households_2016 = only_2016_data(households)

In [41]:
train_spatial = train.merge(spatial_2016.drop(columns=['year']), on='grunnkrets_id', how='left')
muni_avg_revenue = train_spatial.groupby(by='municipality_name', as_index=False)['revenue'].mean()
train_spatial = train_spatial.merge(muni_avg_revenue, on='municipality_name', how='left', suffixes=(None, '_muni_avg'))

train_spatial.head()

Unnamed: 0,store_id,year,store_name,plaace_hierarchy_id,sales_channel_name,grunnkrets_id,address,lat,lon,chain_name,...,single_parent_children_0_to_5_years,single_parent_children_18_or_above,single_parent_children_6_to_17_years,singles,grunnkrets_name,district_name,municipality_name,geometry,area_km2,revenue_muni_avg
0,983540538-974187930-44774,2016,MCDONALD'S BRAGERNES TORG MAGASINET,1.1.1.0,Hamburger restaurants,6020303,BRAGERNES TORG 13,59.743104,10.204928,MCDONALDS,...,0.0,0.0,8.0,78.0,Bragernes sentrum 3,Bragernes sentrum,Drammen,"POLYGON((10.2046156903846 59.7447808519649, 10...",0.155779,7.662207
1,983540538-974187930-44774,2016,MCDONALD'S BRAGERNES TORG MAGASINET,1.1.1.0,Hamburger restaurants,6020303,BRAGERNES TORG 13,59.743104,10.204928,MCDONALDS,...,4.0,6.0,11.0,75.0,Bragernes sentrum 3,Bragernes sentrum,Drammen,"POLYGON((10.2046156903846 59.7447808519649, 10...",0.155779,7.662207
2,987074191-973117734-44755,2016,MCDONALD'S KLINGENBERGGATA,1.1.1.0,Hamburger restaurants,3010306,,59.913759,10.734031,MCDONALDS,...,0.0,0.0,0.0,5.0,Sentrum 3 /rode 6,Sentrum 3,Oslo,"POLYGON((10.7303654475615 59.9107195782207, 10...",0.264278,8.103864
3,987074191-973117734-44755,2016,MCDONALD'S KLINGENBERGGATA,1.1.1.0,Hamburger restaurants,3010306,,59.913759,10.734031,MCDONALDS,...,0.0,0.0,0.0,6.0,Sentrum 3 /rode 6,Sentrum 3,Oslo,"POLYGON((10.7303654475615 59.9107195782207, 10...",0.264278,8.103864
4,984890265-981157303-64491,2016,BURGER KING HØNEFOSS,1.1.1.0,Hamburger restaurants,6050102,KONG RINGS GATE 1,60.164751,10.254656,BURGER KING,...,13.0,6.0,12.0,150.0,Sydsiden 2,Hønefoss,Ringerike,"POLYGON((10.2654039198422 60.1639238060368, 10...",0.160152,10.035593


In [42]:
def clean_out_nan_heavy_rows(df: pd.DataFrame):
    """Cleans out rows that have no match in the age, spatial, income or household datasets."""

    # df2 = df.merge(group_ages(age, age_ranges), on='grunnkrets_id', how='left')
    df2 = df.merge(spatial_2016.drop(columns=['year']), on='grunnkrets_id', how='left')
    df2 = df2.merge(income_2016.drop(columns=['year']), on='grunnkrets_id', how='left')
    df2 = df2.merge(households_2016.drop(columns=['year']), on='grunnkrets_id', how='left')

    df_cleaned = df2[
        ~(df2.age_0_19.isna() | df2.couple_children_0_to_5_years.isna() | df2.grunnkrets_name.isna() | df2.income_all_households.isna())
    ]

    print(f'Cleaned out {len(df) - len(df_cleaned)} out of {len(df)} rows.')

    return df_cleaned

In [43]:
train_uncleaned = train.merge(group_ages(age, age_ranges), on='grunnkrets_id', how='left')
train_uncleaned = train_uncleaned.merge(spatial_2016.drop(columns=['year']), on='grunnkrets_id', how='left')
train_uncleaned = train_uncleaned.merge(income_2016.drop(columns=['year']), on='grunnkrets_id', how='left')
train_uncleaned = train_uncleaned.merge(households_2016.drop(columns=['year']), on='grunnkrets_id', how='left')

train_cleaned = clean_out_nan_heavy_rows(train)

AttributeError: 'DataFrame' object has no attribute 'couple_children_0_to_5_years'

In [None]:
print(len(train_uncleaned), len(train_cleaned))
print(train_uncleaned.isna().sum())
print(train_cleaned.isna().sum())

In [None]:
train_df = train.merge(spatial_2016.drop(columns=['year']), on='grunnkrets_id', how='left')
train_df = train_df.merge(income_2016.drop(columns=['year']), on='grunnkrets_id', how='left')
train_df = train_df.merge(households_2016.drop(columns=['year']), on='grunnkrets_id', how='left')

assert len(train) == len(train_df)

train = train_df[
    ~(train_df.couple_children_0_to_5_years.isna() | train_df.grunnkrets_name.isna() | train_df.income_all_households.isna())
]





In [None]:
print(len(train), len(train_df))
train.isna().sum()

In [None]:
train_house_nans = train_house.grunnkrets_id[pd.isnull(train_house.couple_children_0_to_5_years)]
train_spatial_nans = train_spatial.grunnkrets_id[pd.isnull(train_spatial.grunnkrets_name)]
train_income_nans = train_income.income_all_households[pd.isnull(train_income.income_all_households)]

train_cleaned = train[
    ~(train.grunnkrets_id.isin(train_house_nans)) \
    | ~(train.grunnkrets_id.isin(train_spatial_nans)) \
    | ~(train.grunnkrets_id.isin(train_income_nans))
]

print(len(train), len(train_cleaned))
train_cleaned.isna().sum()

train.grunnkrets_id.isin(train_house_nans).value_counts()

In [None]:
def plot_corr(data):
  df = data[['revenue', 
    # 'age_0_19', 'age_20_39', 'age_40_59', 'age_60_79', 'age_80_90', 
    # 'bus_stops_count', 'Mangler viktighetsnivå', 'Standard holdeplass', 'Lokalt knutepunkt', 'Nasjonalt knutepunkt', 'Regionalt knutepunkt', 'Annen viktig holdeplass', 
    'dist_to_center', 'lat','lon'
    ]]
  df['knutepunkt'] = data[['Lokalt knutepunkt', 'Nasjonalt knutepunkt', 'Regionalt knutepunkt']].sum(axis=1)
  # df.revenue = np.exp(df.revenue)
  # df.bus_stops_count = np.sqrt(df.bus_stops_count)
  df = df[df.dist_to_center < 70_000]
  # df.dist_to_center = np.log(df.dist_to_center)
  
  plt.figure(figsize=(15, 15))
  pairplot = sns.pairplot(df)
  # heatmap = sns.heatmap(df.corr(), vmin=-1, vmax=1, annot=True)


# data_full =  pd.merge(X_train, y_train, left_index=True, right_index=True) 
# plot_corr(data_full)


In [None]:
train_full = train.merge(spatial_2016.drop(columns=['year']), on='grunnkrets_id', how='left')
train_full = train_full.merge(income_2016.drop(columns=['year']), on='grunnkrets_id', how='left')
train_full = train_full.merge(households_2016.drop(columns=['year']), on='grunnkrets_id', how='left')

train_full.isna().sum()