# Étude des valeurs foncières françaises
### Liens: https://www.data.gouv.fr/fr/datasets/demandes-de-valeurs-foncieres/

In [None]:
# Feature flag: when True, the notebook geocodes every address through the
# Google Maps API to add latitude/longitude (slow, and costly).
add_lat_long = False

In [None]:
import numpy as np
import pandas as pd
import missingno as msno
import glob
from datetime import datetime

#### Chargement des fichiers des valeurs foncières

In [None]:
# Dtypes forced at load time: postal codes are read as a category and street
# numbers as nullable integers, so a missing number does not turn into a float.
def_types = {'Code postal': 'category', 'No voie': 'Int64'}

# 'Date mutation' is written day/month/year, e.g. 03/01/2018.
DATE_FORMAT = '%d/%m/%Y'


def dateparse(x):
    """Parse one DD/MM/YYYY string (kept so existing references still work)."""
    return datetime.strptime(x, DATE_FORMAT)


def _read_valeurs_foncieres(path):
    """Load one pipe-separated export and convert 'Date mutation' to datetime.

    The dates are converted in one vectorised call after reading, instead of
    calling ``strptime`` once per row through the deprecated ``date_parser``
    argument of ``read_csv``.
    """
    frame = pd.read_csv(path, sep='|', decimal=",", dtype=def_types)
    frame['Date mutation'] = pd.to_datetime(frame['Date mutation'], format=DATE_FORMAT)
    return frame


# Sorted so the row order does not depend on the file system listing order.
all_files = sorted(glob.glob("data/*.txt"))
if not all_files:
    raise FileNotFoundError("No 'data/*.txt' file found to load")

# Every file restarts its index at 0: rebuild a single unique index.
df_val_fonc = pd.concat(
    (_read_valeurs_foncieres(f) for f in all_files),
    ignore_index=True,
)

In [None]:
# Lookup table between INSEE codes and postal codes (semicolon-separated file,
# '.' as decimal mark).
file_insee = 'data/correspondance-code-insee-code-postal.csv'
df_commune = pd.read_csv(file_insee, sep=';', decimal=".")

In [None]:
# Left join: keep every transaction and attach the lookup rows whose
# 'Code Postal' equals the transaction's 'Code postal'.
# NOTE(review): if one postal code appears on several lookup rows, the join
# repeats the transaction once per match - confirm this is intended.
df = pd.merge(df_val_fonc, df_commune, how='left', left_on=['Code postal'], right_on=['Code Postal'])

In [None]:
# Summary statistics of the merged frame.
df.describe()

In [None]:
# Column list with non-null counts and dtypes.
df.info()

In [None]:
# Number of columns per dtype.
df.dtypes.value_counts()

#### Renomme les colonnes

In [None]:
import unidecode

# Normalise every header: spaces and apostrophes become underscores, the text
# is lower-cased and accents are transliterated to plain ASCII.
columns = {
    column_name: unidecode.unidecode(
        column_name.replace(' ', '_').replace('\'', '_').lower()
    )
    for column_name in df.columns
}
df.rename(columns=columns, inplace=True)

## Nettoyage simple des données

In [None]:
def missing_values_assessment(df):
    """Print a missing-data summary of `df` and return per-column statistics.

    Printed: number of rows, number of cells, number of missing cells, the
    share of non-missing cells (in percent) and the count of columns per dtype.

    Args:
        df: the DataFrame to inspect.

    Returns:
        A DataFrame indexed by the columns of `df` with three columns:
        'Manquant' (missing count), 'Manquant %' (missing share of the rows,
        rounded to 2 decimals) and 'Type' (column dtype).
    """
    # Computed once and reused for the totals and the per-column report.
    missing_per_column = df.isnull().sum()
    nan_values = missing_per_column.sum()
    # An empty frame has no cell at all: avoid dividing 0 by 0.
    missing_share = (nan_values / df.size) * 100 if df.size else 0.0

    print('Nombre d\'observations: {:,}'.format(len(df)))
    print('Nombre de valeurs: {:,}'.format(df.size))
    print('Valeurs manquantes: {:,}'.format(nan_values))
    print('Qualité des données: {}%'.format(round(100 - missing_share, 2)))
    # No '%' here: the value is a count of columns per dtype, not a ratio.
    print('Type de données:\n {}'.format(df.dtypes.value_counts()))

    analysis = {'Manquant': missing_per_column,
                'Manquant %': ((missing_per_column / len(df)) * 100).round(2),
                'Type': df.dtypes
               }
    return pd.DataFrame(analysis)

#### Analyse des données manquantes

In [None]:
# Per-column report, columns with the largest share of missing values first.
missing_values_assessment(df).sort_values('Manquant %', ascending=False)

#### Suppression des colonnes sans données

In [None]:
# Remove the columns that hold no value at all, then report which ones went.
orig_col = df.columns
df.dropna(axis='columns', how='all', inplace=True)
remaining_columns = set(df.columns)
print("Suppression de {} colonnes sans données:".format(len(orig_col)-len(df.columns)))
print([item for item in orig_col if item not in remaining_columns])

#### Suppression des colonnes sans intérêt pour notre étude

In [None]:
# Columns that are not used in this study. They are dropped in one call so the
# frame is rebuilt once, and a missing name raises before anything is removed.
#
# Deliberately kept: 'voie', 'b/t/q', 'no_voie', 'code_postal', 'code_commune',
# 'date_mutation', 'nature_culture' and 'nature_culture_speciale'.
columns_to_drop = [
    # Cadastral reference of the parcel ('code_commune' is kept)
    'prefixe_de_section', 'section', 'no_plan', 'no_volume',
    # Disposition number
    'no_disposition',
    # Local type code, a duplicate of the 'type_local' label
    'code_type_local',
    # Street code
    'code_voie',
    # Lot identifiers
    '5eme_lot', '4eme_lot', '3eme_lot', '2eme_lot', '1er_lot',
]
df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# INSEE data: drop the INSEE code column.
df.drop(columns=['code_insee'], inplace=True)

In [None]:
# Remove the rows that have no 'valeur_fonciere'.
rows_without_value = df['valeur_fonciere'].isnull()
indexNames = df.index[rows_without_value]
df.drop(index=indexNames, inplace=True)

In [None]:
# Missing Carrez, built and land surfaces are replaced with zero.
# The result is assigned back to the frame: calling fillna(inplace=True) on
# df['col'] works on an intermediate object and does not update `df` when
# pandas Copy-on-Write is active.
surface_columns = [
    'surface_carrez_du_5eme_lot',
    'surface_carrez_du_4eme_lot',
    'surface_carrez_du_3eme_lot',
    'surface_carrez_du_2eme_lot',
    'surface_carrez_du_1er_lot',
    'surface_reelle_bati',
    'surface_terrain',
]
df[surface_columns] = df[surface_columns].fillna(0)

#### Suppression des lignes en double

In [None]:
# Show every row that has at least one exact duplicate (all copies included).
df[df.duplicated(keep=False)]

In [None]:
# Keep the first occurrence of each duplicated row and report how many rows
# were discarded.
initial_length = len(df)
df.drop_duplicates(keep='first', inplace=True)
removed_rows = initial_length - len(df)
print("Suppression des doublons: {}".format(removed_rows))

#### Analyse graphique des données manquantes

In [None]:
# Bar chart of missing data per column (missingno), sorted ascending.
msno.bar(df, sort='ascending')

In [None]:
# Matrix view of the missing data (missingno), sorted ascending.
msno.matrix(df, sort='ascending')

## Nettoyage fonctionnel des données

In [None]:
# Postal code formatting: put a '0' in front of the 4-character codes and
# leave every other value as it is.
df['code_postal'] = df['code_postal'].apply(
    lambda code: '0' + str(code) if len(str(code)) == 4 else code
)

In [None]:
# Sub-frame holding only the object and category columns.
category_cols = df.select_dtypes(include=['object','category'])

In [None]:
# Display the textual / categorical columns.
category_cols

In [None]:
# For each textual / categorical column, print its name, the number of
# distinct values and the values themselves.
for cat in category_cols:
    cat_val = df[cat].unique()
    header = '{} ({}):'.format(cat, len(cat_val))
    print(header)
    print('{}\n'.format(cat_val))

### Catégories

In [None]:
#df = pd.get_dummies(df, columns=['nature_mutation'])
#df = pd.get_dummies(df, columns=['type_de_voie'])

### Feature engineering Address

In [None]:
def _text_or_blank(column):
    """Return `column` as strings, with missing entries rendered as ''.

    Handles NaN as well as pd.NA: str() of a missing nullable integer gives
    '<NA>', so replacing 'nan' alone would leave '<NA>' in the address.
    """
    return column.astype(object).where(column.notna(), '').astype(str)


# Build "<number> <b/t/q> <street type> <street>, <postal code> <commune>".
# The separators are unchanged, so the resulting strings stay usable as keys
# of the geocoding cache.
df['address'] = (
    _text_or_blank(df['no_voie']) + ' '
    + _text_or_blank(df['b/t/q']) + ' '
    + _text_or_blank(df['type_de_voie']) + ' '
    + _text_or_blank(df['voie']) + ', '
    + _text_or_blank(df['code_postal']) + ' '
    + _text_or_blank(df['commune'])
).str.strip()

In [None]:
# Drop the raw columns that were used to build 'address'.
df.drop(columns=['no_voie', 'b/t/q', 'voie', 'code_postal'], inplace=True)

In [None]:
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import GoogleV3

# CSV file used as an on-disk cache of the geocoding results.
cache_file = 'data/cache.dat'

def save_locations(df, path=None):
    """Write the geocoding cache `df` as CSV, with a header and no index.

    Args:
        df: DataFrame holding the cached locations.
        path: destination file; defaults to the module-level `cache_file`.
    """
    target = cache_file if path is None else path
    # index=False is the documented way to leave the index out of the file.
    df.to_csv(target, index=False, header=True)

def load_locations(path=None):
    """Load the geocoding cache, or start an empty one.

    Args:
        path: CSV file to read; defaults to the module-level `cache_file`.

    Returns:
        The cached locations as a DataFrame. When the file is missing or
        empty, an empty DataFrame with the columns 'latitude', 'longitude',
        'address' and 'address_unformatted'.
    """
    source = cache_file if path is None else path
    try:
        return pd.read_csv(source)
    # Only the two situations named in the message are recovered from; any
    # other error (malformed file, permissions, ...) is left to propagate.
    except (FileNotFoundError, pd.errors.EmptyDataError):
        print('{} doesn\'t exist or is empty - creating a new dataframe'.format(source))
        return pd.DataFrame(columns=['latitude','longitude','address', 'address_unformatted'])

# Read the Google API key from the first line of the key file.
# strip() removes the trailing newline that readline() keeps, which would
# otherwise be sent as part of the key.
with open('google_api_key.txt') as f:
    google_api_key = f.readline().strip()

def add_locations(df, location, address):
    """Return a copy of the cache `df` with one geocoding result appended.

    Args:
        df: DataFrame holding the cached locations; it is not modified.
        location: object exposing `latitude`, `longitude` and `address`.
        address: the address string that was submitted to the geocoder.

    Returns:
        A new DataFrame with the extra row and a fresh 0..n-1 index.
    """
    new_row = pd.DataFrame([{'latitude': location.latitude,
                             'longitude': location.longitude,
                             'address': location.address,
                             'address_unformatted': address}])
    # DataFrame.append was removed in pandas 2.0; concat works on every version.
    return pd.concat([df, new_row], ignore_index=True)
    
def get_geocode(df, address):
    """Look `address` up in the cache `df`, querying Google on a cache miss.

    Args:
        df: cache DataFrame with an 'address_unformatted' column.
        address: the address string to geocode.

    Returns:
        A `(df, location)` pair. `df` is the cache, extended with the new
        result when the geocoder was queried successfully. `location` exposes
        `.latitude`, `.longitude` and `.address` (a cached row or the geocoder
        result), or is None when the lookup failed or returned nothing.
    """
    # Local import: `traceback` is not imported in the visible part of the file.
    import traceback

    cached = df[df['address_unformatted'] == address]
    if not cached.empty:
        # Return one row rather than a DataFrame so that attribute access
        # yields scalars, exactly like a geocoder result.
        return df, cached.iloc[0]

    #print('Address not in cache, requesting GeoPy')
    geopy = GoogleV3(api_key=google_api_key)
    # Start from None so a failed request is not reported as a location.
    location = None
    try:
        location = geopy.geocode(address)
        # The geocoder gives None when it finds no match: nothing to cache.
        if location is not None:
            df = add_locations(df, location, address)
    except Exception:
        # Best effort: report the failure and carry on with the next address.
        print('Exception on address: {}'.format(address))
        traceback.print_exc()
    return df, location

In [None]:
# Create the columns filled by the geocoding step (only when it is enabled).
if add_lat_long:
    # Assigning NaN already yields float columns; no extra cast is needed.
    df['latitude'] = np.nan
    df['longitude'] = np.nan
    # This column receives text: create it with object dtype (None) instead of
    # a float NaN column, which recent pandas refuses to fill with strings.
    df['address_formatted'] = None

In [None]:
# Geocode every row, reusing and extending the on-disk cache.
if add_lat_long:
    import traceback

    # The cache must exist before the first lookup.
    geolocation_df = load_locations()

    for processed, (index, row) in enumerate(df.iterrows(), start=1):
        address = row['address']
        #print('Searching address:{}'.format(row['address']))
        geolocation_df, location = get_geocode(geolocation_df, address)
        try:
            if location is not None:
                df.at[index, 'latitude'] = location.latitude
                df.at[index, 'longitude'] = location.longitude
                df.at[index, 'address_formatted'] = location.address
        except Exception:
            # Best effort: report the row and continue with the next one.
            print('Exception on address [{}]: {}'.format(index, row['address']))
            traceback.print_exc()
        # Checkpoint every 100 processed rows. A row counter is used because
        # index labels are no longer contiguous once rows have been dropped.
        if processed % 100 == 0:
            print('Saving - index: {}'.format(index))
            save_locations(geolocation_df)

    # Persist the results obtained since the last checkpoint.
    save_locations(geolocation_df)

In [None]:
# Missing-data report after cleaning, worst columns first.
missing_values_assessment(df).sort_values('Manquant %', ascending=False)

## Save the file

In [None]:
# Write the cleaned dataset as CSV, with a header row and without the index.
df.to_csv(r'data/valeursfoncieres-clean-df.csv', index = None, header=True)

In [None]:
# Give the department and commune codes a fixed width by padding with zeros
# on the LEFT (department: 2 characters, commune: 3 characters).
# Padding the commune code on the right made distinct codes collide:
# '1', '10' and '100' all became '100'.
df['code_departement'] = df['code_departement'].astype(str).str.zfill(2)
df['code_commune'] = df['code_commune'].astype(str).str.zfill(3)

In [None]:
# Full commune code: department code followed by the commune code.
df["code_commune_long"] = df["code_departement"] + df["code_commune"]

In [None]:
# Display the codes in sorted order. The call parentheses are required:
# without them the cell only shows the bound method.
df['code_commune_long'].sort_values()

In [None]:
from numpy import savetxt
# Distinct department+commune codes found in the dataset.
code_commune_long = df['code_commune_long'].unique()

In [None]:
# Write the distinct codes to a file, each value wrapped in double quotes.
savetxt('data/code_commune.csv', code_commune_long, delimiter=',', fmt='"%s"')