In [1]:
import numpy as np
import pandas as pd
import pgeocode
import matplotlib.pyplot as plt
import plotly.express as px

results = pd.read_csv('../data/radon-thoron.csv', encoding='latin-1')
results.head()



Unnamed: 0,resultNumber,censusMetropolitanArea,province,forwardSortationAreaCodes,testDurationInDays,averageRadonConcentrationInBqPerM3,averageThoronConcentrationInBqPerM3
0,1,Calgary CMA,AB,*,90,89,14.0
1,2,Calgary CMA,AB,*,92,18,14.0
2,3,Calgary CMA,AB,*,91,114,14.0
3,4,Calgary CMA,AB,*,91,185,14.0
4,5,Calgary CMA,AB,*,91,71,14.0


In [2]:
results.describe()


Unnamed: 0,resultNumber,testDurationInDays,averageRadonConcentrationInBqPerM3,averageThoronConcentrationInBqPerM3
count,3190.0,3190.0,3190.0,3160.0
mean,1595.5,96.203762,117.290909,16.768987
std,921.018006,12.883509,144.367973,12.566019
min,1.0,30.0,14.0,14.0
25%,798.25,91.0,41.0,14.0
50%,1595.5,92.0,75.0,14.0
75%,2392.75,97.0,139.0,14.0
max,3190.0,301.0,2117.0,232.0


In [3]:
cols = ['censusMetropolitanArea', 'province', 'forwardSortationAreaCodes','averageRadonConcentrationInBqPerM3']
df = pd.read_csv('../data/radon-thoron.csv', usecols=cols, encoding='latin-1')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data/radon-thoron.csv'

In [None]:
df.describe()

Unnamed: 0,averageRadonConcentrationInBqPerM3
count,3190.0
mean,117.290909
std,144.367973
min,14.0
25%,41.0
50%,75.0
75%,139.0
max,2117.0


In [None]:
df.value_counts()

censusMetropolitanArea  province  forwardSortationAreaCodes  averageRadonConcentrationInBqPerM3
Vancouver CMA           BC        *                          14                                    16
Toronto CMA             ON        *                          14                                     5
Montreal CMA            QC        *                          14                                     5
Toronto CMA             ON        *                          23                                     4
Montreal CMA            QC        *                          22                                     4
                                                                                                   ..
Kitchener CMA           ON        N2G                        21                                     1
                                                             29                                     1
                                                             30                         

In [None]:
print("Number of rows with radon concentration values above 300: ", len(df[df['averageRadonConcentrationInBqPerM3'] > 300])) # Output is number of radon concentration values that are above the 300 range
print("This is equivalent to: ", len(df[df['averageRadonConcentrationInBqPerM3'] > 300])/len(df.index) * 100, "percent of entries")

print()
print("Number of rows with radon concentration values below 100: ", len(df[df['averageRadonConcentrationInBqPerM3'] < 100])) # Output is number of radon concentration values that are below 100
print("This is equivalent to: ", len(df[df['averageRadonConcentrationInBqPerM3'] < 100])/len(df.index) * 100, "percent of entries")

Number of rows with radon concentration values above 300:  205
This is equivalent to:  6.426332288401254 percent of entries

Number of rows with radon concentration values below 100:  2014
This is equivalent to:  63.13479623824452 percent of entries


In [None]:
# Test to manipulate forward sortation codes

nomi = pgeocode.Nominatim('ca')
location = nomi.query_postal_code('L3T')
print(location.latitude)
print(location.longitude)



43.8227
-79.3946


In [None]:
# New columns to store latitude and longitude
df["latitude"] = np.nan
df["longitude"] = np.nan
df.head()

Unnamed: 0,censusMetropolitanArea,province,forwardSortationAreaCodes,averageRadonConcentrationInBqPerM3,latitude,longitude
0,Calgary CMA,AB,*,89,,
1,Calgary CMA,AB,*,18,,
2,Calgary CMA,AB,*,114,,
3,Calgary CMA,AB,*,185,,
4,Calgary CMA,AB,*,71,,


In [None]:
# Populating the new latitude and longitude columns
for row in df.index:
    location = nomi.query_postal_code(df['forwardSortationAreaCodes'][row])
    df['latitude'][row] = location.latitude
    df['longitude'][row] = location.longitude

df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['latitude'][row] = location.latitude
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['longitude'][row] = location.longitude
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['latitude'][row] = location.latitude
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['longitude'][row] = location.longitude
A va

Unnamed: 0,censusMetropolitanArea,province,forwardSortationAreaCodes,averageRadonConcentrationInBqPerM3,latitude,longitude
0,Calgary CMA,AB,*,89,,
1,Calgary CMA,AB,*,18,,
2,Calgary CMA,AB,*,114,,
3,Calgary CMA,AB,*,185,,
4,Calgary CMA,AB,*,71,,


In [None]:
# Remove Naan values for plotting purposes
df2 = df.dropna()
df2.head()

Unnamed: 0,censusMetropolitanArea,province,forwardSortationAreaCodes,averageRadonConcentrationInBqPerM3,latitude,longitude
9,Calgary CMA,AB,T1Y,125,51.0823,-113.9578
10,Calgary CMA,AB,T1Y,123,51.0823,-113.9578
11,Calgary CMA,AB,T1Y,114,51.0823,-113.9578
12,Calgary CMA,AB,T1Y,91,51.0823,-113.9578
13,Calgary CMA,AB,T2A,123,51.0494,-113.9564


In [None]:
# Now, let's remove radon concentrations that are below 300
df2 = df2.drop(df2[df2['averageRadonConcentrationInBqPerM3'] <= 300].index)
print("Number of rows here are: ", len(df2))
df2.head()

Number of rows here are:  192


Unnamed: 0,censusMetropolitanArea,province,forwardSortationAreaCodes,averageRadonConcentrationInBqPerM3,latitude,longitude
44,Calgary CMA,AB,T2W,322,50.9514,-114.3591
45,Calgary CMA,AB,T2W,344,50.9514,-114.3591
70,Calgary CMA,AB,T3G,393,51.1387,-114.2015
77,Calgary CMA,AB,T3H,841,51.0419,-114.2
91,Calgary CMA,AB,T3P,309,51.2074,-114.1348


In [None]:
# We will display the data

fig = px.scatter_geo(df2,lat='latitude',lon='longitude', hover_name="averageRadonConcentrationInBqPerM3")
fig.update_layout(title = 'World map', title_x=0.5)
fig.show()