# Prepare companies Sirene registry

## Imports

In [9]:
%run "../config/notebook.ipynb"
%run "../config/files.ipynb"
%run "../utils/stop_execution.ipynb"

# import communes of the Haute-Garonne
%run "../prepare/prepare_communes_hautegaronne.ipynb"

import pandas as pd
import numpy as np

df_communes_haute_garonne: 586 communes


## Read from cache

In [10]:
if 'FROM_CACHE' in locals() or 'FROM_CACHE' in globals():
    if FROM_CACHE:
        df_sirenes = pd.read_csv(PREPARED_SIRENES_HAUTEGARONNE_FILE_PATH, index_col=['postalcode', 'siret'])
        print("(from cache) df_sirenes: {} companies".format(len(df_sirenes.index)))        
        raise StopExecution

## Create the sirene dataframe

 ### Read the sirene csv file

In [11]:
# keep only columns
#  0   siren                          int64  
#  2   siret                          int64  
#  4   dateCreationEtablissement      object 
#  16   trancheEffectifsEtablissement  object 
#  20   codePostalEtablissement        float64
#  5   codeCommuneEtablissement        object
df_sirenes = pd.read_csv(COMPANIES_STATISTICS_FILE_PATH, usecols=[0,2,4,16,20,5], \
                         dtype={'siren': np.int64, \
                               'siret': np.int64, \
                               'dateCreationEtablissement': np.str, \
                               'trancheEffectifsEtablissement': np.str, \
                               'codePostalEtablissement': np.str, \
                               'codeCommuneEtablissement': np.str})

### Rename columns

In [12]:
df_sirenes.columns = ['siren', 'siret', 'creation_date', 'staffing_range', 'postalcode', 'commune_code']

### Add a creation_year column

In [13]:
df_sirenes['year'] = df_sirenes['creation_date'].apply(lambda d: str(d).split('-')[0])

### Filter on Haute-Garonne communes

In [14]:
# set temporarily the index on commune_code
df_sirenes = df_sirenes.set_index('commune_code')

In [15]:
# join with the Haute-Garonne communes
df_sirenes = df_sirenes.join(df_communes_haute_garonne, how='inner')

In [16]:
# restore a no-index state
df_sirenes = df_sirenes.reset_index()

### Remove unused columns

In [19]:
df_sirenes = df_sirenes.drop(['index'], axis=1)

### Set the index

In [None]:
df_sirenes = df_sirenes.set_index(['postalcode', 'siret'])

### Save the dataframe

In [20]:
df_sirenes.to_csv(PREPARED_SIRENES_HAUTEGARONNE_FILE_PATH)
print("df_sirenes: {} companies".format(len(df_sirenes.index)))

df_sirenes: 598958 companies
