# Analysis of air pollution data from Europe

In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import kurtosis
from scipy.integrate import odeint
import statistics
from scipy.stats import lognorm
import math
import string
import sdeint
from joblib import Parallel, delayed 
import multiprocessing
from scipy.optimize import curve_fit
import pandas as pd
import seaborn as sns
from scipy import optimize
from scipy.stats import expon
import matplotlib.ticker as mtick
from scipy.stats import rv_continuous
from sklearn.metrics import mean_squared_error, r2_score 
from sympy import *
from sympy import summation, symbols, solve, Function, Sum, log
from numpy import random
from mpl_toolkits.axes_grid.inset_locator import (inset_axes, InsetPosition,mark_inset)
from collections import Counter
import os

## 1.) set up functions

In [None]:
#function for the q-exponential function
def q_exp_func(x,q,l):
    """Evaluate the q-exponential density with index ``q`` and rate ``l`` at ``x``.

    The bracket ``1 + (q - 1) * l * x`` is split into its sign and magnitude so
    the fractional power is only ever applied to a non-negative number.
    ``q == 1`` is not handled here (the exponent ``1 / (1 - q)`` is undefined
    there); ``exp_func`` covers that case.
    """
    bracket = 1 + (q - 1) * l * x
    power = 1/(1 - q)
    return (2 - q) * l * np.sign(bracket) * np.abs(bracket)**power

#function for the exponential function
def exp_func(x,l):
    """Exponential probability density with rate ``l`` evaluated at ``x``."""
    decay = np.exp(-l*x)
    return l*decay

#class for generating instances of the q-exponential function
class qExp_gen(rv_continuous): 
    """q-exponential distribution with shape parameters l (rate) and q.

    For q equal to 1 the density reduces to the ordinary exponential
    density; otherwise it is the q-exponential evaluated by q_exp_func.
    """
    #pdf defines the pdf
    def _pdf(self, x, l , q):
        # The last-used parameters are still stored on the instance because
        # the original class exposed them as attributes; nothing in this
        # class reads them any more.
        self.l=l
        self.q=q
        q_values = np.asarray(q, dtype=float)
        is_exponential = q_values == 1
        # fast path (the usual one while fitting): no q == 1 entries at all
        if not np.any(is_exponential):
            return q_exp_func(x,q_values,l)
        # q may be an array, so the two branches are selected element-wise.
        # Entries with q == 1 get a harmless placeholder before being passed
        # to q_exp_func, whose exponent 1/(1-q) is undefined there.
        q_placeholder = np.where(is_exponential, 0.5, q_values)
        return np.where(is_exponential,
                        exp_func(x,l),
                        q_exp_func(x,q_placeholder,l))

    #stats returns (mean, variance, skewness, kurtosis) of the standardised
    #distribution; None lets scipy compute that moment numerically
    def _stats(self,l, q):
        rate = np.asarray(l, dtype=float)
        q_values = np.asarray(q, dtype=float)
        # Closed forms of the q-exponential: the mean is finite only for
        # q < 3/2 and the variance only for q < 4/3 (for q = 1 they reduce to
        # 1/l and 1/l**2). The masked-out entries may divide by zero, hence
        # the errstate guard.
        # NOTE(review): for q < 1 these formulas assume the density is cut
        # off where the bracket turns negative; _pdf does not truncate there.
        with np.errstate(divide='ignore', invalid='ignore'):
            mean = np.where(q_values < 1.5,
                            1 / (rate * (3 - 2 * q_values)),
                            np.inf)
            variance = np.where(
                q_values < 4 / 3,
                (q_values - 2)
                / ((2 * q_values - 3) ** 2 * (3 * q_values - 4) * rate ** 2),
                np.inf)
        return mean, variance, None, None
    #fitstart provides a starting point for any MLE fit
    def _fitstart(self,data):
        # scipy's fit() takes the loc and scale starting values from the last
        # two entries, so the tuple is (l, q, loc, scale)
        return (1.1,1.1,0.0,1.0)
    #argcheck tests that the parameters are meaningful
    def _argcheck(self, l, q):
        return (l>0)&(q>0)
    
# check if a string is NAN   
def isNaN(string):
    """Return ``string != string``, which is true only for NaN-like values.

    The comparison result is returned as-is (no ``bool()``/``not``), so a
    pandas Series argument yields an element-wise boolean Series.
    """
    differs_from_itself = string != string
    return differs_from_itself

## 2.) set up dataframe

In [None]:
#read sites meta
europe_meta = pd.read_csv('data_sites.csv')
# read dataframe
df = pd.read_csv('European area type.csv')

# one statistics table per pollutant (the columns 'mean', 'std' and
# 'area_type' are used by the plots below)
(no_statistics_df,
 no2_statistics_df,
 pm25_statistics_df,
 pm10_statistics_df) = [
    pd.read_csv(statistics_csv)
    for statistics_csv in (
        'europe NO mean versus standard deviation.csv',
        'europe NO2 mean versus standard deviation.csv',
        'europe PM2.5 mean versus standard deviation.csv',
        'europe PM10 mean versus standard deviation.csv',
    )
]

# one table of fitted parameters per pollutant (the columns 'q', 'l' and
# 'area_type' are used by the plots below)
(no_qvalue_df,
 no2_qvalue_df,
 pm25_qvalue_df,
 pm10_qvalue_df) = [
    pd.read_csv(qvalue_csv)
    for qvalue_csv in (
        'no q,lamda dataframe.csv',
        'no2 q,lamda dataframe.csv',
        'pm2.5 q,lamda dataframe.csv',
        'pm10 q,lamda dataframe.csv',
    )
]


## 3.) scatter plot of mean versus std

In [None]:
# importing required libraries
from matplotlib import gridspec

# 2 x 2 figure: site mean against site standard deviation, one panel per
# pollutant, points coloured by area type.

# create a figure
fig = plt.figure()

# to change size of subplot's
fig.set_figheight(15)
fig.set_figwidth(15)

# create grid for different subplots (no gaps, neighbouring panels share edges)
spec = gridspec.GridSpec(ncols=2, nrows=2,
                         width_ratios=[1, 1], wspace=0,
                         hspace=0, height_ratios=[1, 1])

no_mean_list = no_statistics_df['mean']
no_std_list = no_statistics_df['std']
no2_mean_list = no2_statistics_df['mean']
no2_std_list = no2_statistics_df['std']
pm25_mean_list = pm25_statistics_df['mean']
pm25_std_list = pm25_statistics_df['std']
pm10_mean_list = pm10_statistics_df['mean']
pm10_std_list = pm10_statistics_df['std']

area_palette = ['darkorchid','red','darkorange','yellow','blue','aqua','limegreen']

# (std values, mean values, dataframe, panel title, panel letter); the list
# index is the position inside the grid: 0 and 1 on top, 2 and 3 below
mean_std_panels = [
    (no_std_list, no_mean_list, no_statistics_df, 'NO', 'a'),
    (no2_std_list, no2_mean_list, no2_statistics_df, 'NO2', 'b'),
    (pm25_std_list, pm25_mean_list, pm25_statistics_df, 'PM2.5', 'c'),
    (pm10_std_list, pm10_mean_list, pm10_statistics_df, 'PM10', 'd'),
]

panel_axes = []
for position, (std_values, mean_values, statistics_df, title, letter) in enumerate(mean_std_panels):
    ax = fig.add_subplot(spec[position])
    # x and y are passed by keyword: recent seaborn releases no longer accept
    # them positionally
    sns.scatterplot(x=std_values, y=mean_values, data=statistics_df, hue='area_type',
                    palette=area_palette, ax=ax, linewidth=0.3, alpha=0.9)
    ax.set_title(title, y=-0.01, fontsize='19')
    # dashed reference line mean = std
    ax.plot(np.arange(155), linewidth=0.4, color='black', linestyle='--')
    ax.set_ylim(0, 155)
    ax.set_xlim(0, 155)
    ax.xaxis.label.set_visible(False)
    ax.yaxis.label.set_visible(False)

    # only the left column keeps y ticks and only the bottom row keeps x ticks
    if position % 2 == 0:
        ax.tick_params(axis='y', labelsize=17)
    else:
        ax.tick_params(left=False)
        ax.set_yticks([])
    if position >= 2:
        ax.tick_params(axis='x', labelsize=17)
    else:
        ax.tick_params(bottom=False)
        ax.set_xticks([])

    ax.text(0.05, 0.9, letter, transform=ax.transAxes, size=28)
    panel_axes.append(ax)

ax0, ax1, ax2, ax3 = panel_axes

# shared axis labels for the whole figure
fig.text(0.5, 0.04, 'Standard deviation', ha='center',fontsize="21")
fig.text(0.04, 0.5, 'Mean', va='center', rotation='vertical',fontsize='21')

# drop the per-panel legends and draw a single one above the top row,
# anchored on the upper-right panel
for ax in panel_axes:
    ax.get_legend().remove()
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles=handles[0:], labels=labels[0:],loc='upper center', bbox_to_anchor=(0, 1.25),ncol=3, fancybox=True, shadow=True,fontsize='19')
plt.savefig('mean vs std fig.pdf',bbox_inches='tight')

plt.show()


## 4.) scatter plot of q-value versus lambda between various site types

In [None]:
# importing required libraries
from matplotlib import gridspec

# 2 x 2 figure: fitted q against fitted lambda (log x-axis), one panel per
# pollutant, points coloured by area type.

# create a figure
fig = plt.figure()

# to change size of subplot's
fig.set_figheight(13)
fig.set_figwidth(20)

# create grid for different subplots (no gaps, neighbouring panels share edges)
spec = gridspec.GridSpec(ncols=2, nrows=2,
                         width_ratios=[1, 1], wspace=0,
                         hspace=0, height_ratios=[1, 1])

area_palette = ['darkorchid','red','darkorange','yellow','blue','aqua','limegreen']

# (dataframe, panel title, panel letter); the list index is the position
# inside the grid: 0 and 1 on top, 2 and 3 below
q_lambda_panels = [
    (no_qvalue_df, 'NO', 'a'),
    (no2_qvalue_df, 'NO2', 'b'),
    (pm25_qvalue_df, 'PM2.5', 'c'),
    (pm10_qvalue_df, 'PM10', 'd'),
]

panel_axes = []
for position, (qvalue_df, title, letter) in enumerate(q_lambda_panels):
    ax = fig.add_subplot(spec[position])
    qlist = qvalue_df['q']
    llist = qvalue_df['l']
    # x and y are passed by keyword: recent seaborn releases no longer accept
    # them positionally
    sns.scatterplot(x=llist, y=qlist, data=qvalue_df, hue='area_type',
                    palette=area_palette, ax=ax, s=30, linewidth=0.3, alpha=0.9)
    ax.set_xscale('log')
    ax.set_xlim(left=0.003, right=150)
    ax.grid(False)
    ax.set_ylim(bottom=0.5, top=1.7)
    ax.xaxis.label.set_visible(False)
    ax.yaxis.label.set_visible(False)
    ax.set_title(title, y=-0.01, fontsize='23')

    # only the left column keeps y ticks and only the bottom row keeps x ticks
    if position % 2 == 0:
        ax.tick_params(axis='y', labelsize=22)
    else:
        ax.tick_params(left=False)
        ax.set_yticks([])
    if position >= 2:
        ax.tick_params(axis='x', labelsize=22)
    else:
        ax.tick_params(bottom=False)
        ax.set_xticks([])

    ax.text(0.95, 0.9, letter, transform=ax.transAxes, size=28)
    panel_axes.append(ax)

ax0, ax1, ax2, ax3 = panel_axes

# shared axis labels for the whole figure (raw string: '\l' is not a valid
# Python escape sequence)
fig.text(0.5, 0.04, r'$\lambda$', ha='center',fontsize="27")
fig.text(0.06, 0.5, 'q value', va='center', rotation='vertical',fontsize='27')

# drop the per-panel legends and draw a single one above the top row,
# anchored on the upper-right panel
for ax in panel_axes:
    ax.get_legend().remove()
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles=handles[0:], labels=labels[0:],loc='upper center', bbox_to_anchor=(0, 1.35),ncol=3, fancybox=True, shadow=True,fontsize='23')
plt.savefig('q vs lambda fig.pdf',bbox_inches='tight')

plt.show()


## 5.) Plot violin plot

In [None]:
#reset format font
# Overwrite every entry of the active matplotlib rcParams with the value from
# rcParamsDefault before the violin plots are drawn.
from matplotlib import rcParams, rcParamsDefault
rcParams.update(rcParamsDefault)

In [None]:
#compute how many sites for each type
def _count_sites(qvalue_df, area_type):
    """Count the rows of ``qvalue_df`` with q > 0 and the given ``area_type``.

    Both conditions are combined into one boolean mask built on the same
    frame, instead of indexing an already filtered frame with a mask taken
    from the unfiltered one.
    """
    selected = (qvalue_df['q'] > 0) & (qvalue_df['area_type'] == area_type)
    return int(selected.sum())


ut_len_no = _count_sites(no_qvalue_df, 'urban traffic')
srt_len_no = _count_sites(no_qvalue_df, 'suburban/rural traffic')
ub_len_no = _count_sites(no_qvalue_df, 'urban background')
sb_len_no = _count_sites(no_qvalue_df, 'suburban background')
rb_len_no = _count_sites(no_qvalue_df, 'rural background')
ui_len_no = _count_sites(no_qvalue_df, 'urban industrial')
sri_len_no = _count_sites(no_qvalue_df, 'suburban/rural industrial')

ut_len_no2 = _count_sites(no2_qvalue_df, 'urban traffic')
srt_len_no2 = _count_sites(no2_qvalue_df, 'suburban/rural traffic')
ub_len_no2 = _count_sites(no2_qvalue_df, 'urban background')
sb_len_no2 = _count_sites(no2_qvalue_df, 'suburban background')
rb_len_no2 = _count_sites(no2_qvalue_df, 'rural background')
ui_len_no2 = _count_sites(no2_qvalue_df, 'urban industrial')
sri_len_no2 = _count_sites(no2_qvalue_df, 'suburban/rural industrial')

ut_len_pm25 = _count_sites(pm25_qvalue_df, 'urban traffic')
srt_len_pm25 = _count_sites(pm25_qvalue_df, 'suburban/rural traffic')
ub_len_pm25 = _count_sites(pm25_qvalue_df, 'urban background')
sb_len_pm25 = _count_sites(pm25_qvalue_df, 'suburban background')
rb_len_pm25 = _count_sites(pm25_qvalue_df, 'rural background')
ui_len_pm25 = _count_sites(pm25_qvalue_df, 'urban industrial')
sri_len_pm25 = _count_sites(pm25_qvalue_df, 'suburban/rural industrial')

ut_len_pm10 = _count_sites(pm10_qvalue_df, 'urban traffic')
srt_len_pm10 = _count_sites(pm10_qvalue_df, 'suburban/rural traffic')
ub_len_pm10 = _count_sites(pm10_qvalue_df, 'urban background')
sb_len_pm10 = _count_sites(pm10_qvalue_df, 'suburban background')
rb_len_pm10 = _count_sites(pm10_qvalue_df, 'rural background')
ui_len_pm10 = _count_sites(pm10_qvalue_df, 'urban industrial')
sri_len_pm10 = _count_sites(pm10_qvalue_df, 'suburban/rural industrial')


In [None]:
# importing required libraries
from matplotlib import gridspec

# 2 x 2 figure of violin plots of the fitted lambda per area type, one panel
# per pollutant. Each tick label is the area-type abbreviation followed by the
# number of sites of that type (computed in the previous cell).

# create a figure
fig = plt.figure()

# to change size of subplot's
fig.set_figheight(18)
fig.set_figwidth(18)

# create grid for different subplots
spec = gridspec.GridSpec(ncols=2, nrows=2,
                         width_ratios=[1, 1], wspace=0.2,
                         hspace=0.2, height_ratios=[1, 1])

# --- panel a, grid position spec[0]: NO ---
# The empty entries at the end of `order` presumably reserve blank rows for
# the inset axes ax5 created below.
ax0 = fig.add_subplot(spec[0])
sns.violinplot(x = "l", y="area_type", data=no_qvalue_df,width=0.9,
               order=["urban traffic","suburban/rural traffic","urban industrial","urban background","suburban background","suburban/rural industrial",'',""],
               palette=['darkorchid','red','darkorange','yellow','blue','aqua'],
               scale='width',linewidth=3,ax=ax0,cut=0,bw=0.2,hue="area_type",dodge=False)
sns.set_style("whitegrid")
ax0.yaxis.label.set_visible(False)
ax0.xaxis.label.set_visible(False)
ax0.set_yticklabels(['ut\n'+str(ut_len_no),'srt\n'+str(srt_len_no),'ui\n'+str(ui_len_no),
                     'ub\n'+str(ub_len_no),'sb\n'+str(sb_len_no),'sri\n'+str(sri_len_no)], fontsize='25')
ax0.tick_params(axis='x', length=0, labelsize='25')
ax0.set_title('NO', y=-0.005,fontsize='23')

# Inset axes for the rural background sites of panel a; it has its own x-axis.
ax5 = plt.axes([0,0,1,1])
# Manually set the position and relative size of the inset axes within ax0
ip = InsetPosition(ax0, [0,0.12,1,0.124])
ax5.set_axes_locator(ip)
sns.violinplot(x = "l", y="area_type", data=no_qvalue_df, width=0.9,order = ['rural background'], linewidth=3,ax=ax5, palette=['limegreen'],cut=0,bw=0.2)
for spine in ax5.spines.values():
    spine.set_linewidth(0.9)
    spine.set_edgecolor('limegreen')
ax5.yaxis.label.set_visible(False)
ax5.xaxis.label.set_visible(False)
ax5.set_yticklabels(['rb\n'+str(rb_len_no)], fontsize='25')
ax5.tick_params(axis='x', length=0, labelsize='21',colors='k',bottom=True)

# --- panel b, grid position spec[1]: NO2 ---
ax1 = fig.add_subplot(spec[1])
sns.violinplot(x = "l", y="area_type", data=no2_qvalue_df,width=0.9,
               order=["urban traffic","suburban/rural traffic","urban background","urban industrial","suburban background","","",""],
               palette=['darkorchid','red','darkorange','yellow','blue','aqua'],
               scale='width',linewidth=3,ax=ax1,cut=0,bw=0.2,hue="area_type",dodge=False)
sns.set_style("whitegrid")
ax1.yaxis.label.set_visible(False)
ax1.xaxis.label.set_visible(False)
# every abbreviation is paired with the count of the category listed at the
# same position of `order` above
ax1.set_yticklabels(['ut\n'+str(ut_len_no2),'srt\n'+str(srt_len_no2),'ub\n'+str(ub_len_no2),
                     'ui\n'+str(ui_len_no2),'sb\n'+str(sb_len_no2)], fontsize='25')
ax1.tick_params(axis='x', length=0, labelsize='25')
ax1.set_title('NO2', y=-0.005,fontsize='23')
#ax1.set_xlim(left=0.01,right=0.12)


# Inset axes for the suburban/rural industrial and rural background sites of
# panel b; it has its own x-axis.
ax6 = plt.axes([1,0,1,1])
# Manually set the position and relative size of the inset axes within ax1
ip = InsetPosition(ax1, [0,0.1,1,0.27])
ax6.set_axes_locator(ip)
sns.violinplot(x = "l", y="area_type", data=no2_qvalue_df, width=0.9,order = ["suburban/rural industrial",'rural background'],
               linewidth=3,ax=ax6, palette=['yellow','limegreen'],cut=0,bw=0.2)
for spine in ax6.spines.values():
    spine.set_linewidth(1.9)
    spine.set_edgecolor('gold')
ax6.yaxis.label.set_visible(False)
ax6.xaxis.label.set_visible(False)
ax6.set_yticklabels(['sri\n'+str(sri_len_no2),'rb\n'+str(rb_len_no2)], fontsize='25')
ax6.tick_params(axis='x', length=0, labelsize='21',colors='k',bottom=True)


# --- panel c, grid position spec[2]: PM2.5 ---
ax2 = fig.add_subplot(spec[2])
sns.violinplot(x = "l", y="area_type", data=pm25_qvalue_df,width=0.9,
               order=["urban traffic","suburban background","suburban/rural industrial","urban background","urban industrial","rural background","suburban/rural traffic"],
               palette=['darkorchid','red','darkorange','yellow','blue','aqua','limegreen'],
               scale='width',linewidth=3,ax=ax2,cut=0,bw=0.2,hue="area_type",dodge=False)
sns.set_style("whitegrid")
ax2.yaxis.label.set_visible(False)
ax2.xaxis.label.set_visible(False)
ax2.set_yticklabels(['ut\n'+str(ut_len_pm25),'sb\n'+str(sb_len_pm25),'sri\n'+str(sri_len_pm25),
                     'ub\n'+str(ub_len_pm25),'ui\n'+str(ui_len_pm25),'rb\n'+str(rb_len_pm25),'srt\n'+str(srt_len_pm25)], fontsize='25')
ax2.tick_params(axis='x', length=0, labelsize='25')
ax2.set_title('PM2.5', y=-0.005,fontsize='23')


# --- panel d, grid position spec[3]: PM10 ---
ax3 = fig.add_subplot(spec[3])
sns.violinplot(x = "l", y="area_type", data=pm10_qvalue_df,width=0.9,
               order=["urban background","urban industrial","urban traffic","suburban/rural industrial","suburban background","rural background","suburban/rural traffic"],
               palette=['darkorchid','red','darkorange','yellow','blue','aqua','limegreen'],
               scale='width',linewidth=3,ax=ax3,cut=0,bw=0.2,hue="area_type",dodge=False)
sns.set_style("whitegrid")
ax3.yaxis.label.set_visible(False)
ax3.xaxis.label.set_visible(False)
ax3.set_yticklabels(['ub\n'+str(ub_len_pm10),'ui\n'+str(ui_len_pm10),'ut\n'+str(ut_len_pm10),'sri\n'+str(sri_len_pm10),
                     'sb\n'+str(sb_len_pm10),'rb\n'+str(rb_len_pm10),'srt\n'+str(srt_len_pm10)], fontsize='25')
ax3.tick_params(axis='x', length=0, labelsize='25')
ax3.set_title('PM10', y=-0.005,fontsize='23')

#border width
for axis in ['top', 'bottom', 'left', 'right']:
    for panel_ax in (ax0, ax1, ax2, ax3):
        panel_ax.spines[axis].set_linewidth(2)
        panel_ax.spines[axis].set_color('black')
    # the first inset keeps its green border colour, only the width changes
    ax5.spines[axis].set_linewidth(2)

ax0.text(0.9, 0.9,('a'), transform=ax0.transAxes, size=28)
ax1.text(0.9, 0.9,('b'), transform=ax1.transAxes, size=28)
ax2.text(0.9, 0.9,('c'), transform=ax2.transAxes, size=28)
ax3.text(0.9, 0.9,('d'), transform=ax3.transAxes, size=28)


# shared axis labels for the whole figure (raw string: '\l' is not a valid
# Python escape sequence)
fig.text(0.5, 0.08, r'$\lambda$', ha='center',fontsize="27")
fig.text(0.04, 0.5, 'area type', va='center', rotation='vertical',fontsize='27')
ax0.get_legend().remove()
ax1.get_legend().remove()
#ax2.get_legend().remove()
ax3.get_legend().remove()

# Re-use the handles of the legend seaborn attached to panel c for one shared
# legend. The attribute is called `legend_handles` in newer matplotlib
# releases and `legendHandles` in older ones.
# NOTE(review): the label texts are assigned to the handles by position;
# confirm that the handle order of panel c matches this list.
panel_c_legend = ax2.legend_
shared_handles = getattr(panel_c_legend, 'legend_handles', None)
if shared_handles is None:
    shared_handles = panel_c_legend.legendHandles
ax2.legend(handles=shared_handles, labels=["urban traffic (ut)","suburban/rural traffic (srt)", 'urban industrial (ui)',"suburban/rural industrial (sri)",'urban background (ub)','suburban background (sb)','rural background (rb)'],
           loc='upper center', bbox_to_anchor=(1.1, 2.45),ncol=3, fancybox=True, shadow=True,fontsize='19')

plt.savefig('lambda violin fig.pdf',bbox_inches='tight')

plt.show()


## 6.) examples of PDF with cut-off options

In [None]:
# importing required libraries
from matplotlib import gridspec

# Two example densities. In each panel the kernel density estimate of the
# values below 1.5 times the mean is drawn in red up to its maximum (the
# "peak") and in blue after it.

# create a figure
fig = plt.figure()

# to change size of subplot's
fig.set_figheight(4)
fig.set_figwidth(15)

# create grid for different subplots
spec = gridspec.GridSpec(ncols=2, nrows=1,
                         width_ratios=[1, 1], wspace=0.2)

# One entry per panel: site file, pollutant, histogram colour, KDE bandwidth,
# axis labels and the hand-placed coordinates of the 'Peak' annotation.
# Raw strings are used for the labels because '\m' is not a valid Python
# escape sequence.
peak_examples = [
    {'csv': 'at30101.csv', 'variable': 'no', 'color': 'gold', 'bw': 0.05,
     'xlabel': r'NO Concentration [$\mu g m^{-3}$]', 'ylabel': 'Density',
     'arrow_tip': (1.8, 0.47), 'text_pos': (0.28, 0.85)},
    {'csv': 'sk0008a.csv', 'variable': 'pm2.5', 'color': 'limegreen', 'bw': 0.2,
     'xlabel': r'PM2.5 Concentration [$\mu g m^{-3}$]', 'ylabel': None,
     'arrow_tip': (9.5, 0.053), 'text_pos': (0.35, 0.7)},
]

example_axes = []
for position, example in enumerate(peak_examples):
    ax = fig.add_subplot(spec[position])
    #choose a site
    importdata = pd.read_csv(example['csv'])
    data1 = importdata[importdata["variable"]==example['variable']]["value"]
    # keeping only values > 0 also removes NaN entries (NaN > 0 is False)
    data1=data1[data1>0]
    data2=np.array(data1)
    datarange=1.5*np.mean(data2)
    data_close_to_peak=data2[data2<datarange]
    hist_lines=sns.distplot(data_close_to_peak, norm_hist=True, kde=True, color=example['color'],
                            kde_kws={"bw":example['bw'],"gridsize":max(500,round(10*datarange))}, ax=ax)
    # the first line on the axes is the KDE curve
    (xvalues_hist,yvalues_hist)=hist_lines.get_lines()[0].get_data()
    #define the concentration which appears the most
    y_peak=np.argmax(yvalues_hist)
    peakOfdata=xvalues_hist[y_peak]
    print(peakOfdata)
    # vertical marker at the peak
    ax.plot([peakOfdata,peakOfdata],[0,yvalues_hist[y_peak]],'k')
    if example['ylabel'] is not None:
        ax.set_ylabel(example['ylabel'],fontsize=18)
    ax.set_xlabel(example['xlabel'],fontsize=18)
    # part of the curve at and after the peak in blue, before it in red
    ax.plot(xvalues_hist[y_peak:],yvalues_hist[y_peak:],'deepskyblue')
    ax.plot(xvalues_hist[:y_peak+1],yvalues_hist[:y_peak+1],'r')
    ax.annotate('Peak', xy=example['arrow_tip'],xycoords='data',xytext=example['text_pos'],  textcoords='axes fraction',arrowprops=dict(facecolor='black', shrink=0.02),fontsize="16")
    ax.tick_params(axis='both', labelsize=14)
    example_axes.append(ax)

ax0, ax1 = example_axes

plt.savefig('peak fig.pdf',bbox_inches='tight')
plt.show()


## 7.) Define fitting functions

In [None]:
#choose site, pollutant type and log-scale setting
def _load_site_values(code, pol):
    """Return the usable ``pol`` values of site ``code`` as an array, else None.

    A site is rejected (the reason is printed and None is returned) when its
    metadata is incomplete, its data file is empty, the pollutant is missing,
    fewer than 365*24 positive values in 'ug.m-3' remain, more than 15% of
    those rows have ``validity`` > 1, or a single value makes up more than
    15% of them.
    """
    site_meta = europe_meta[europe_meta['site']==code]
    if site_meta.empty:
        # no metadata row at all: handled like a site without a name
        print('site name unkonwn')
        return None
    # if the site is listed more than once the first row is used
    site_row = site_meta.iloc[0]
    #check if the location has name
    if isNaN(site_row['site_name']):
        print('site name unkonwn')
        return None
    #check if the location can be found on map
    if math.isnan(site_row['latitude']) or math.isnan(site_row['longitude']):
        print('location unkonwn')
        return None
    #check if the area type is defined
    if isNaN(site_row['site_type']) or isNaN(site_row['site_area']):
        print('area type not defined')
        return None

    #Choose site
    importdata = pd.read_csv(str(code)+'.csv')
    if importdata.empty:
        print('empty file')
        return None
    if not (importdata["variable"]==pol).any():
        print('no pollutant data')
        return None

    importdata_pol = importdata[importdata["variable"]==pol]
    #check the unit to be ug.m-3
    importdata_pol = importdata_pol[importdata_pol["unit"]=='ug.m-3']
    #remove zeros and negative values
    importdata_pol = importdata_pol[importdata_pol["value"]>0]
    #Check if the amount of dataset is fewer than a year
    if len(importdata_pol)<365*24:
        print('less than one year data')
        return None
    # Check if censored data consist 15% of the entire dataset
    len_validity = len(importdata_pol[importdata_pol["validity"]>1])
    if len_validity/len(importdata_pol)>0.15:
        print('censored data > 15%')
        return None
    data1 = importdata_pol["value"]
    # Check if the most occured data consist 15% of the entire dataset
    if Counter(data1).most_common(1)[0][1]/len(data1)>0.15:
        print('Too many duplicated data')
        return None
    return np.array(data1)


#choose site, pollutant type and log-scale setting
def func_4fits_tail(code,pol,uselogscale=True):
    """Plot the histogram of one pollutant at one site together with five fits.

    The exponential and q-exponential distributions are fitted by maximum
    likelihood to the values above the peak of the density only; gamma,
    lognormal and Weibull are fitted to all values with the location fixed
    at 0.

    Parameters
    ----------
    code : site code; the data are read from '<code>.csv' and the metadata
        from the module-level ``europe_meta`` table.
    pol : pollutant name as it appears in the 'variable' column.
    uselogscale : if True the y-axis is logarithmic.

    Returns
    -------
    None. If the site is rejected the reason is printed and nothing is
    drawn. Otherwise the figure is left open as the current figure;
    showing or saving it is up to the caller.
    """
    data_full = _load_site_values(code, pol)
    if data_full is None:
        return

    # The peak is located on a kernel density estimate. The estimate is drawn
    # on a throw-away axes which is closed again right away, so no empty
    # figure is left behind.
    scratch_fig, scratch_ax = plt.subplots(1, figsize=(0.6,0.5))
    datarange=1.5*np.mean(data_full)
    data_close_to_peak=data_full[data_full<datarange]
    hist_lines=sns.distplot(data_close_to_peak, norm_hist=True, kde=True, color='gold',
                            kde_kws={"bw":0.2,"gridsize":max(500,round(10*datarange))}, ax=scratch_ax)
    (xvalues_hist,yvalues_hist)=hist_lines.get_lines()[0].get_data()
    plt.close(scratch_fig)
    #define the concentration which appears the most
    peakOfdata=xvalues_hist[np.argmax(yvalues_hist)]
    #only consider the data higher than the peak
    data_tail=data_full[data_full>peakOfdata]

    f, ax2 = plt.subplots(1, figsize=(10,6))

    #max of the data
    xmax = max(data_tail)
    #generate an instance of the q-exponential, we need a=0 as lower bound for the support of the function
    q_exp=qExp_gen(name="q exponential",a=0)
    #MLE for exponential distribution (built in function)
    #returns (loc, scale), lambda = 1/scale
    parameters_exp_MLE=expon.fit(data_tail, floc=peakOfdata)
    #MLE for q-exponential distribution, returns (l, q, loc, scale)
    parameters_Qexp_MLE=q_exp.fit(data_tail,1.1,1.1,floc=peakOfdata,fscale=1)
    #range of maximum of data
    xRange=np.arange(0,xmax,0.1)
    xRange_exp=np.arange(peakOfdata,xmax,0.1)
    # the tail fits only describe the data right of the peak, so they are
    # rescaled by the fraction of the data located there
    tail_fraction=len(data_tail)/len(data_full)

    #plot PDF
    sns.distplot(data_full, norm_hist=True, kde=False, ax=ax2)
    #exponential fit
    ax2.plot(xRange_exp,tail_fraction*exp_func((xRange_exp-peakOfdata),1/parameters_exp_MLE[1]))
    #q-exponential fit
    ax2.plot(xRange_exp,tail_fraction*q_exp_func((xRange_exp-peakOfdata),parameters_Qexp_MLE[1],parameters_Qexp_MLE[0]))
    #gamma fit
    gamma_param=stats.gamma.fit(data_full,floc=0)
    y = stats.gamma.pdf(xRange, gamma_param[0], loc=gamma_param[1], scale=gamma_param[2])
    ax2.plot(xRange,y,'-')
    #lognormal fit
    lognorm_param = stats.lognorm.fit(data_full, floc=0)
    y = stats.lognorm.pdf(xRange, lognorm_param[0], loc = lognorm_param[1], scale=lognorm_param[2])
    ax2.plot(xRange,y,'-')
    #weibull fit
    weibull_param = stats.weibull_min.fit(data_full, floc=0)
    y = stats.weibull_min.pdf(xRange, weibull_param[0], loc = weibull_param[1], scale=weibull_param[2])
    ax2.plot(xRange,y,'-')

    #formatting of plot
    ax2.set_title(code)
    if uselogscale:
        ax2.set_yscale('log')
        ax2.set_title('PDF on log scale')
    ax2.set_ylabel('PDF',fontsize=17)
    ax2.set_xlim(left=0, right=xmax)
    # raw string: '\m' is not a valid Python escape sequence
    ax2.set_xlabel(str(pol)+r' concentration [$\mu g m^{-3}$]',fontsize=17)

    ax2.legend(['exponential','q-exponential','gamma','lognormal','weibull','Data'],fontsize=15)

    return