# Analysis of Data

### Import libraries and data

In [None]:
import pandas as pd
import geopandas as gpd
from collections import Counter
import re
from math import floor
import contextily as cx
from matplotlib import pyplot as plt
from paris_methods import get_change_over_years, create_grid, assign_gridnumber
from analysis import get_prof_str, plot_ratio_over_time, gif_for_professions,\
     jobs_not_before_after_specific_year, sort_by_number_of_words, plot_profession_selection_on_map

In [None]:
# Load the pickled dataset and wrap it in a GeoDataFrame that uses its own
# "geometry" column as the active geometry.
pickled_entries = pd.read_pickle("data/unique_aligned_tagged.pkl")
rich_data = gpd.GeoDataFrame(pickled_entries, geometry=pickled_entries.geometry)

# Make the profession tags usable as plain strings
# (from "['profession1', 'profession2']" to "profession1, profession2").
rich_data["tags"] = rich_data["tags"].apply(get_prof_str)

# Store the centroid of every geometry in its own column
# (rich_data.geometry.representative_point would be an alternative).
rich_data["centroid"] = rich_data.geometry.centroid

## Preparation

In [None]:
# Bin the years into decades: e.g. 1860-1869 -> 1860 (the 1860s; the "s" is
# omitted so that the label stays an integer).
# The labels are derived from the bin edges so that each decade is named after
# its first year (the first label used to be mistyped as 1839 instead of 1830).
decade_edges = list(range(1830, 1931, 10))  # 1830, 1840, ..., 1930
rich_data["annee_bin"] = pd.cut(
    rich_data["annee"],
    bins=decade_edges,
    right=False,  # intervals are closed on the left: [1860, 1870)
    labels=decade_edges[:-1],  # one label per interval: 1830, ..., 1920
)
# Histogram of the decade labels, using the same decade edges as the bins.
rich_data["annee_bin"].hist(bins=decade_edges)

In [None]:
# Keep only the professions (tags) that occur more than 50 times in the dataset.
tag_counts = Counter(rich_data["tags"])
freq_jobs = [metier for metier in tag_counts if tag_counts[metier] > 50]
print("number of frequent jobs:", len(freq_jobs))

# Compare the total number of rows with the number of rows whose tag is frequent.
print("all rows:", len(rich_data))
has_freq_job = rich_data["tags"].isin(freq_jobs)
freq_job_data = rich_data[has_freq_job]
print("only rows with frequent jobs:", len(freq_job_data))

In [None]:
# Check whether restricting the data to frequent jobs introduces a bias over time.

# Absolute number of tagged rows per year, once for all rows (rich_data)
# and once for the rows with frequent jobs only (freq_job_data).
count_year = rich_data.groupby("annee")[["tags"]].count()
count_year_freq = freq_job_data.groupby("annee")[["tags"]].count()
# Both frames have a "tags" column: the left one (all rows) is renamed to
# "tags_all", the right one (frequent jobs) keeps the name "tags".
both_freq = count_year.join(count_year_freq, lsuffix="_all")

# Bars for both counts side by side, per year.
# Observation from this plot: omitting the jobs with a frequency of 50 or less
# affects all years evenly.
both_freq.plot.bar(y=["tags_all", "tags"], figsize=(10,6))

## Analysis on Professions

In [None]:
# Print the ten most common values of the raw profession column ("metier")
# and of the tag column ("tags"), so the two can be compared.
for label, column in (
    ("Most frequent jobs in profession raw data:", "metier"),
    ("Most frequent jobs in tagged data:", "tags"),
):
    print(label, Counter(rich_data[column]).most_common(10))

In [None]:
# Ratio over time of the ten most common tags among the frequent jobs.
top_jobs10 = Counter(freq_job_data["tags"]).most_common(10)
# most_common yields (name, count) pairs; only the names are needed for plotting.
top_jobnames10 = [pair[0] for pair in top_jobs10]
plot_ratio_over_time(
    rich_data,
    top_jobnames10,
    title="Development top 10 jobs in dataset",
)

In [None]:
# Distribution over the years of the food-related professions
# (butcher, baker, grocer, pork butcher), written out under the name "food".
gif_for_professions(
    rich_data,
    ["boucher", "boulanger", "épicier", "charcutier"],
    "food",
    geo_col="geometry",
)

In [None]:
# Distribution over the years of people living off property
# ("rentier", "propriétaire"), written out under the name "housing".
gif_for_professions(
    rich_data,
    ["rentier", "propriétaire"],
    "housing",
    geo_col="geometry",
)

### Professions which were born or died out

In [None]:
# Among the frequent jobs, get those which did not exist either before or after 1880.
not_after1880, not_before1880 = jobs_not_before_after_specific_year(freq_job_data, 1880)

# sort_by_number_of_words returns three values; only the first one
# (the one-worded tags) is kept here.
not_bef1880_one, _, _ = sort_by_number_of_words(not_before1880)
not_aft1880_one, _, _ = sort_by_number_of_words(not_after1880)
print(
    "new profession entries - only after 1880 (one word entries):\n",
    not_bef1880_one,
    "\n\nold profession entries - only before 1880 (one word entries):\n",
    not_aft1880_one,
)

In [None]:
# Plot the development over time of the one-word professions that only show up
# after 1880 together with those that only show up before 1880.
one_word_changes = not_bef1880_one + not_aft1880_one
plot_ratio_over_time(
    rich_data,
    one_word_changes,
    title="Jobs which died out before or appeared after 1880",
)

In [None]:
# Gif showing how the new mobility professions (cars, garages, bicycles)
# develop throughout Paris, written out under the name "mobility".
gif_for_professions(
    rich_data,
    ["automobiles", "garage", "cycles", "bicyclettes"],
    "mobility",
    geo_col="geometry",
)

## Street statistics

In [None]:
# Names of the 20 most frequent streets (the counts from most_common are dropped).
top_streets20 = [
    street for street, _ in Counter(rich_data["streetname"]).most_common(20)
]

# Ratio over time, computed on the street name column instead of the default column.
plot_ratio_over_time(
    rich_data,
    top_streets20,
    col_name="streetname",
    title="Development of most frequent streets in dataset",
)

### Looking at one street

In [None]:
# Job development in a single street: "Rue Saint Honoré".
on_saint_honore = rich_data["streetname"] == "Rue Saint Honoré"
honore = rich_data[on_saint_honore]
# Names of the ten most common tags in that street (counts are dropped).
honore_jobs = [job for job, _ in Counter(honore["tags"]).most_common(10)]

plot_ratio_over_time(
    honore,
    honore_jobs,
    title="Top job development in street 'Rue Saint Honoré'",
)

In [None]:
# Job development in a single street: "Boulevard Voltaire".
on_voltaire = rich_data["streetname"] == "Boulevard Voltaire"
voltaire = rich_data[on_voltaire]
# Names of the ten most common tags in that street (counts are dropped).
voltaire_jobs = [job for job, _ in Counter(voltaire["tags"]).most_common(10)]

plot_ratio_over_time(
    voltaire,
    voltaire_jobs,
    title="Top job development in street 'Boulevard Voltaire'",
)

In [None]:
# Draw the geometry of the first "Boulevard Voltaire" row on a basemap.
street_ax = voltaire.head(1).plot()
# Fixed map window, given in the coordinates of the plotted data.
street_ax.set_xlim(250000, 270000)
street_ax.set_ylim(6244000, 6258000)
# No crs is passed, so contextily uses its default CRS for the tiles
# NOTE(review): this assumes the data is already in that CRS - confirm.
cx.add_basemap(street_ax)

# Gridwork

In [None]:
# Build a gridsize x gridsize grid from the data and assign grid numbers to rich_data.
# assign_gridnumber's return value is not used; presumably it adds the "grid"
# column in place (that column is read from FinalUnique when plotting below).
gridsize = 4
gridX, gridY = create_grid(gridsize, gridsize, rich_data)
assign_gridnumber(rich_data, gridX, gridY)

# The streets get grid numbers as well (makes dataViz quicker):
# load them, drop the helper columns and add the centroid column first.
FinalUnique = gpd.GeoDataFrame(
    pd.read_pickle("data/FinalUnique.pkl"),
    geometry="geometry",
).drop(columns=["buffer", "filter"])
FinalUnique["centroid"] = FinalUnique.centroid
assign_gridnumber(FinalUnique, gridX, gridY)

# Show the grid: streets coloured by grid number on top of a basemap
# that is fetched in the CRS of the street data.
matrix = FinalUnique.plot(column='grid', cmap='gist_rainbow')
cx.add_basemap(matrix, crs=FinalUnique.crs.to_string())


In [None]:
# Collect the yearly change values used for the per-region barplots.
# Index 0 of both lists holds the result for all of Paris,
# index i (1..gridsize**2) the result for grid cell i.
groupby = "annee"

# change in all of Paris
change_all, years = get_change_over_years(rich_data, yearcolumn=groupby)
profession_change_regions = [change_all]
profession_change_years = [years]

# change in every single grid cell
for i in range(1, gridsize**2 + 1):
    subset = rich_data[rich_data["grid"] == i]
    change_in_grid, years = get_change_over_years(subset, yearcolumn=groupby)
    profession_change_regions += [change_in_grid]
    profession_change_years += [years]



In [None]:
# Collect the change values per decade bin ("annee_bin") for the barplots.
# Index 0 of both lists holds the result for all of Paris,
# index i (1..gridsize**2) the result for grid cell i.
groupby = "annee_bin"

# change in all of Paris
change_all, years = get_change_over_years(rich_data, yearcolumn=groupby)
profession_change_regions_bin = [change_all]
profession_change_years_bin = [years]

# change in every single grid cell
for i in range(1, gridsize**2 + 1):
    subset = rich_data[rich_data["grid"] == i]
    change_in_grid, years = get_change_over_years(subset, yearcolumn=groupby)
    profession_change_regions_bin += [change_in_grid]
    profession_change_years_bin += [years]



In [None]:
# One bar chart per grid cell, arranged like the grid itself.
# squeeze=False keeps axs a 2-D array even for gridsize == 1; without it
# plt.subplots returns a single Axes object and axs[x][y] would fail.
fig, axs = plt.subplots(gridsize, gridsize, sharex=True, sharey=True, squeeze=False)
for i in range(1, gridsize**2 + 1):
    # Grid numbers start at 1 and are laid out row by row; row 0 of the numbering
    # is drawn in the bottom subplot row, so the row index is flipped.
    row_from_bottom, y = divmod(i - 1, gridsize)
    x = gridsize - row_from_bottom - 1
    # Index i of the lists belongs to grid cell i (index 0 is not drawn here).
    axs[x][y].bar(profession_change_years[i], profession_change_regions[i], width=2)
    axs[x][y].set_title(f"{i}")
# Shared y-axis label, placed just left of the figure area.
fig.text(-0.02, 0.5, 'change indication', va='center', rotation='vertical')
plt.suptitle("Changes in professional mix-up per grid-spot")
plt.tight_layout()
plt.show()


In [None]:

# Bar chart of the binned change values stored at index 0 of the *_bin lists
# (the title labels them as the overall change in Paris).
overall_ax = plt.gca()
overall_ax.bar(profession_change_years_bin[0], profession_change_regions_bin[0], width=8)
overall_ax.set_title("Overall change in professional mix-up in Paris")
overall_ax.set_ylabel("change indicator")
