<a href="https://colab.research.google.com/github/haiderali2017/my_exploratory_data_analyses/blob/main/Data_Indicator_4_RTB_Rents_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

In [None]:
# Smart Dublin open-data CSV for "Indicator 4 - RTB rent"
RTB_RENT_CSV_URL = 'https://data.smartdublin.ie/dataset/4997223b-13b2-4c97-9e88-cd94c6d35aec/resource/9fdf2ee1-cfd8-4fd2-ad2c-be8314aece01/download/indicator-4-rtb-rent-.csv'

# Load the raw export; every later cell works on (and rebinds) `df`
df = pd.read_csv(RTB_RENT_CSV_URL)

# Data Exploration


In [None]:
# Preview the first five rows of the raw frame
df.head(n=5)

Unnamed: 0,Quarter,Dublin,Greater Dublin Area,Outside GDA,Column2,%YoY Dublin,%YoY Greater Dublin,%YoY Outside GDA,Column4,YoY Dublin,...,QoQ Dublin,Column8,%QoQ Dublin,Column9,Column10,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22
0,Q3 07,"€1,245","€1,000",€767,,,,,,,...,,,,,,,,,,#REF!
1,Q4 07,"€1,301","€1,025",€767,,,,,,,...,€56,,4.5%,2.5%,0.0%,,,,,#REF!
2,Q1 08,"€1,381","€1,027",€712,,,,,,,...,€80,,6.1%,0.2%,-7.2%,,,,,#REF!
3,Q2 08,"€1,380","€1,025",€709,,,,,,,...,-€1,,-0.1%,-0.1%,-0.4%,,,,,
4,Q3 08,"€1,357","€1,011",€697,,9.0%,1.1%,-9.1%,,€112,...,-€23,,-1.7%,-1.4%,-1.6%,,,,,


In [None]:
# Preview the last five rows (the final row of the export is entirely empty)
df.tail(n=5)

Unnamed: 0,Quarter,Dublin,Greater Dublin Area,Outside GDA,Column2,%YoY Dublin,%YoY Greater Dublin,%YoY Outside GDA,Column4,YoY Dublin,...,QoQ Dublin,Column8,%QoQ Dublin,Column9,Column10,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22
63,Q2 23,"€1,969","€1,413",€997,,5.7%,3.4%,9.2%,,€106,...,€34,,1.8%,1.3%,1.9%,,,,,
64,Q3 23,"€2,006","€1,450","€1,037",,6.3%,6.5%,12.3%,,€119,...,€37,,1.9%,2.6%,4.0%,,,,,
65,Q4 23,"€2,035","€1,469","€1,093",,6.7%,6.0%,12.7%,,€127,...,€29,,1.4%,1.3%,5.4%,,,,,
66,Q1 24,"€2,065","€1,516","€1,091",,6.7%,8.7%,11.4%,,€130,...,€30,,1.5%,3.3%,-0.2%,,,,,
67,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Column names, non-null counts and dtypes of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Quarter               67 non-null     object 
 1   Dublin                67 non-null     object 
 2   Greater Dublin Area   67 non-null     object 
 3   Outside GDA           67 non-null     object 
 4   Column2               0 non-null      float64
 5   %YoY Dublin           63 non-null     object 
 6   %YoY Greater Dublin   63 non-null     object 
 7   %YoY Outside GDA      63 non-null     object 
 8   Column4               0 non-null      float64
 9   YoY Dublin            63 non-null     object 
 10  Column5               33 non-null     object 
 11  Column6               33 non-null     object 
 12  Column7               33 non-null     object 
 13  QoQ Dublin            66 non-null     object 
 14  Column8               0 non-null      float64
 15  %QoQ Dublin           66 

# Data Cleaning


1. Removing unnecessary columns
2. Removing missing values
3. Removing trailing / leading spaces in column names
4. Removing '€' from column values
5. Removing '%' from column values
6. Changing column data types


### 1. Removing unnecessary columns

In this step, we are dropping columns that are Unnamed as well as unnecessary.

In [None]:
# 'Unnamed: 18' .. 'Unnamed: 22' are the trailing columns without a header
unnamed_cols = [f'Unnamed: {idx}' for idx in range(18, 23)]

# 'ColumnN' helper columns that the analysis does not use.
# NOTE(review): Column9 / Column10 look like the QoQ % change for the other
# two regions -- confirm they are really not needed before relying on this.
unneeded_cols = [f'Column{idx}' for idx in (2, 4, 5, 6, 7, 8, 9, 10)]

# errors='ignore' keeps the cell re-runnable once the columns are gone
df = df.drop(columns=unnamed_cols + unneeded_cols, errors='ignore')

### 2. Removing missing values

In this step, we are removing every row that contains at least one missing value.

In [None]:
# Count the NaN cells per column (Series indexed by column name)
missing_values = df.isnull().sum()

print(missing_values)

Quarter                 1
Dublin                  1
Greater Dublin Area     1
Outside GDA             1
%YoY Dublin             5
%YoY Greater Dublin     5
%YoY Outside GDA        5
YoY Dublin              5
QoQ Dublin              2
%QoQ Dublin             2
dtype: int64


In [None]:
# Keep only the rows without any missing value ...
complete_rows = df.dropna()
# ... and renumber them from 0 so the index has no gaps
df = complete_rows.reset_index(drop=True)

In [None]:
# Show the frame after dropping incomplete rows (the notebook renders a truncated view)
df

Unnamed: 0,Quarter,Dublin,Greater Dublin Area,Outside GDA,%YoY Dublin,%YoY Greater Dublin,%YoY Outside GDA,YoY Dublin,QoQ Dublin,%QoQ Dublin
0,Q3 08,"€1,357","€1,011",€697,9.0%,1.1%,-9.1%,€112,-€23,-1.7%
1,Q4 08,"€1,329",€994,€687,2.2%,-3.0%,-10.5%,€28,-€28,-2.0%
2,Q1 09,"€1,287",€969,€671,-6.8%,-5.6%,-5.7%,-€94,-€42,-3.2%
3,Q2 09,"€1,248",€938,€655,-9.5%,-8.5%,-7.5%,-€132,-€39,-3.0%
4,Q3 09,"€1,206",€905,€639,-11.1%,-10.4%,-8.4%,-€151,-€42,-3.4%
...,...,...,...,...,...,...,...,...,...,...
58,Q1 23,"€1,935","€1,395",€979,5.7%,1.9%,9.1%,€104,€28,1.4%
59,Q2 23,"€1,969","€1,413",€997,5.7%,3.4%,9.2%,€106,€34,1.8%
60,Q3 23,"€2,006","€1,450","€1,037",6.3%,6.5%,12.3%,€119,€37,1.9%
61,Q4 23,"€2,035","€1,469","€1,093",6.7%,6.0%,12.7%,€127,€29,1.4%


### 3. Removing trailing / leading spaces in column names


In this step, we are removing trailing and leading spaces in column names.

In [None]:
# Trim leading/trailing whitespace from every column label
stripped_labels = df.columns.str.strip()
df.columns = stripped_labels

### 4. Removing '€' from column values

In this step, we are removing the € symbol from the values: as long as it is present, the amounts are stored as text rather than numbers, which leads to misleading visuals.

In [None]:
def replace_percentages(df, col_name):
  """Remove every '€' character from the values of ``df[col_name]``.

  Despite its name, this definition of the helper handles the euro sign;
  it is redefined further down for other characters.

  The column is overwritten on ``df`` itself; nothing is returned.
  """
  without_euro = df[col_name].replace(to_replace='€', value='', regex=True)
  df[col_name] = without_euro

# Euro-denominated columns: the three rent levels plus the absolute YoY/QoQ changes
for euro_col in ('Dublin', 'Greater Dublin Area', 'Outside GDA', 'YoY Dublin', 'QoQ Dublin'):
  replace_percentages(df, euro_col)

### 5. Removing '%' from column values

In this step, we are removing the % symbol from the values: as long as it is present, the percentages are stored as text rather than numbers, which leads to misleading visuals.

In [None]:
def replace_percentages(df, col_name):
  """Remove every '%' character from the values of ``df[col_name]``.

  The column is overwritten on ``df`` itself; nothing is returned.
  """
  without_percent = df[col_name].replace(to_replace='%', value='', regex=True)
  df[col_name] = without_percent

# Percentage columns: YoY change for all three regions plus Dublin's QoQ change
for percent_col in ('%YoY Dublin', '%YoY Greater Dublin', '%YoY Outside GDA', '%QoQ Dublin'):
  replace_percentages(df, percent_col)

In [None]:
# Show the frame after the symbol clean-up (the notebook renders a truncated view)
df

Unnamed: 0,Quarter,Dublin,Greater Dublin Area,Outside GDA,%YoY Dublin,%YoY Greater Dublin,%YoY Outside GDA,YoY Dublin,QoQ Dublin,%QoQ Dublin
0,Q3 08,1357,1011,697,9.0,1.1,-9.1,112,-23,-1.7
1,Q4 08,1329,994,687,2.2,-3.0,-10.5,28,-28,-2.0
2,Q1 09,1287,969,671,-6.8,-5.6,-5.7,-94,-42,-3.2
3,Q2 09,1248,938,655,-9.5,-8.5,-7.5,-132,-39,-3.0
4,Q3 09,1206,905,639,-11.1,-10.4,-8.4,-151,-42,-3.4
...,...,...,...,...,...,...,...,...,...,...
58,Q1 23,1935,1395,979,5.7,1.9,9.1,104,28,1.4
59,Q2 23,1969,1413,997,5.7,3.4,9.2,106,34,1.8
60,Q3 23,2006,1450,1037,6.3,6.5,12.3,119,37,1.9
61,Q4 23,2035,1469,1093,6.7,6.0,12.7,127,29,1.4


In [None]:
# Check the dtypes after the symbol clean-up
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Quarter              63 non-null     object
 1   Dublin               63 non-null     object
 2   Greater Dublin Area  63 non-null     object
 3   Outside GDA          63 non-null     object
 4   %YoY Dublin          63 non-null     object
 5   %YoY Greater Dublin  63 non-null     object
 6   %YoY Outside GDA     63 non-null     object
 7   YoY Dublin           63 non-null     object
 8   QoQ Dublin           63 non-null     object
 9   %QoQ Dublin          63 non-null     object
dtypes: object(10)
memory usage: 5.1+ KB


### 6. Changing column data types

In this step, we are converting data types to avoid misleading visuals.

In [None]:
# changing data types of percentage columns (text -> float)
percent_cols = ['%YoY Dublin', '%YoY Greater Dublin', '%YoY Outside GDA', '%QoQ Dublin']
df = df.astype(dict.fromkeys(percent_cols, 'float'))

# removing commas from euro columns
def replace_percentages(df, col_name):
  """Remove every ',' (thousands separator) from the values of ``df[col_name]``.

  The column is overwritten on ``df`` itself; nothing is returned.
  """
  without_commas = df[col_name].replace(to_replace=',', value='', regex=True)
  df[col_name] = without_commas

# Same euro-denominated columns as before, now without thousands separators
for euro_col in ('Dublin', 'Greater Dublin Area', 'Outside GDA', 'YoY Dublin', 'QoQ Dublin'):
  replace_percentages(df, euro_col)

# changing data types of euro columns (text -> int)
euro_cols = ['Dublin', 'Greater Dublin Area', 'Outside GDA', 'YoY Dublin', 'QoQ Dublin']
df = df.astype(dict.fromkeys(euro_cols, 'int'))

In [None]:
# Confirm the numeric dtypes after the conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Quarter              63 non-null     object 
 1   Dublin               63 non-null     int64  
 2   Greater Dublin Area  63 non-null     int64  
 3   Outside GDA          63 non-null     int64  
 4   %YoY Dublin          63 non-null     float64
 5   %YoY Greater Dublin  63 non-null     float64
 6   %YoY Outside GDA     63 non-null     float64
 7   YoY Dublin           63 non-null     int64  
 8   QoQ Dublin           63 non-null     int64  
 9   %QoQ Dublin          63 non-null     float64
dtypes: float64(4), int64(5), object(1)
memory usage: 5.1+ KB


After performing all the data cleaning steps, our dataset has shrunk to 63 rows and 10 columns.

### Story the Dataset is Telling:

The dataset is related to **rental housing prices** in Ireland, specifically based on data from the **Residential Tenancies Board (RTB)**, which maintains a register of tenancies. This means the dataset is likely tracking **rent prices** in the private rental market across different regions (Dublin, Greater Dublin Area, and Outside GDA).

Here’s how this description enhances the interpretation of the dataset:

1. **Dublin, Greater Dublin Area, Outside GDA:** These columns represent average rent prices (in euros) for private rental housing in those regions.

2. **%YoY Dublin, %YoY Greater Dublin, %YoY Outside GDA:** These columns show the year-over-year percentage change in rent prices for each region. For example, in Q3 2008, rent prices in Dublin increased by 9.0% compared to Q3 2007.

3. **YoY Dublin:** This column represents the absolute change in rent prices (in euros) for Dublin compared to the same quarter in the previous year. For Q3 2008, rents in Dublin increased by €112 compared to Q3 2007.

4. **QoQ Dublin:** This column shows the absolute change in rent prices (in euros) for Dublin compared to the previous quarter (Q2 2008). In Q3 2008, rents decreased by €23 compared to Q2 2008.

5. **%QoQ Dublin:** This column represents the quarter-over-quarter percentage change in rent prices for Dublin. In Q3 2008, rents decreased by 1.7% compared to Q2 2008.

# Data Visualisation


1. Trend of rent prices in Dublin, GDA and outside GDA (Quarter wise)
2. Trend of rent prices in Dublin, GDA and outside GDA (Year wise, line chart)
3. Trend of rent prices in Dublin, GDA and outside GDA (Year wise, grouped bar chart)

### 1. Trend of rent prices in Dublin, GDA and outside GDA (Quarter wise)

In [None]:
# One line per region, plotted against the quarter labels
region_columns = ["Dublin", "Greater Dublin Area", "Outside GDA"]
fig = px.line(
    df,
    x='Quarter',
    y=region_columns,
    title='Trend of rent prices in Dublin, GDA and outside GDA',
)
fig.show()

### 2. Trend of rent prices in Dublin, GDA and outside GDA (Year wise)

In [None]:
# Build a one-column frame holding each distinct year of the quarterly data
df_rtb_rents = (
    # take the two-digit year after the space in 'Q3 08' and prefix '20' -> '2008'
    ('20' + df['Quarter'].str.split(' ').str[-1])
    # object -> int so the plots get a numeric year axis
    .astype(str)
    .astype(int)
    # one row per year instead of one per quarter
    .drop_duplicates()
    # fresh index starting from 0 for df_rtb_rents
    .reset_index(drop=True)
    .to_frame(name='Year')
)

In [None]:
column_names = ['Dublin', 'Greater Dublin Area', 'Outside GDA']

# Derive the calendar year of every quarterly row from its label
# ('Q3 08' -> 2008), the same way df_rtb_rents['Year'] was built.
# Grouping on the exact year replaces the former substring filter:
# str.contains('8') also matched 'Q1 18' and str.contains('9') matched 'Q1 19',
# so 2018/2019 rents were blended into the 2008/2009 averages.
quarter_year = ('20' + df['Quarter'].str.split(' ').str[-1]).astype(int).rename('Year')

# Mean rent per year and region (index: year, columns: column_names)
yearly_means = df.groupby(quarter_year)[column_names].mean()

# Loop over the column_names
for sectors in column_names:
  # Look up each year's mean by value rather than by list position, so the
  # result stays aligned with df_rtb_rents['Year'] for any range of years
  mean_values = df_rtb_rents['Year'].map(yearly_means[sectors])

  # Truncate to whole euros (as int() did before) and add the values to
  # 'df_rtb_rents' with the region name as the column name
  df_rtb_rents[sectors] = mean_values.astype(int)

In [None]:
year_ticks = df_rtb_rents['Year']

fig = px.line(
    df_rtb_rents,
    x='Year',
    y=['Dublin', 'Greater Dublin Area', 'Outside GDA'],
    title='Trend of rent prices in Dublin, GDA and outside GDA',
)

# Force one tick (and label) per year instead of plotly's automatic spacing
fig.update_xaxes(tickmode='array', tickvals=year_ticks, ticktext=year_ticks)

fig.show()

### 3. Trend of rent prices in Dublin, GDA and outside GDA (Year wise)

In [None]:
years = df_rtb_rents['Year']

# Bar colour per region, listed in the order the traces are added
region_colours = {
    'Dublin': 'rgb(7, 43, 186)',
    'Greater Dublin Area': 'rgb(186, 7, 7)',
    'Outside GDA': 'rgb(7, 186, 31)',
}

fig = go.Figure()
# One bar trace per region, all sharing the same year axis
for region, colour in region_colours.items():
  fig.add_trace(go.Bar(x=years,
                       y=df_rtb_rents[region],
                       name=region,
                       marker_color=colour))

fig.update_layout(
    title=dict(text='Trend of rent prices in Dublin, GDA and outside GDA'),
    # x axis: one tick and label per year
    xaxis=dict(
        tickmode='array',
        tickvals=years,
        ticktext=years,
        tickfont_size=12,
        title=dict(text="Years", font=dict(size=16)),
    ),
    yaxis=dict(
        title=dict(text="Rent", font=dict(size=16)),
    ),
    # legend in the top-left corner with a transparent background and border
    legend=dict(
        x=0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)',
    ),
    barmode='group',
    bargap=0.15,      # gap between bars of adjacent location coordinates.
    bargroupgap=0.1,  # gap between bars of the same location coordinate.
)
fig.show()