# Environment Setup

In [1]:
import pandas as pd
from datetime import timedelta, datetime
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

# Acquire

In [2]:
df = pd.read_csv('GlobalLandTemperaturesByCity.csv')

In [3]:
df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


In [4]:
df.shape

(8599212, 7)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8599212 entries, 0 to 8599211
Data columns (total 7 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   dt                             object 
 1   AverageTemperature             float64
 2   AverageTemperatureUncertainty  float64
 3   City                           object 
 4   Country                        object 
 5   Latitude                       object 
 6   Longitude                      object 
dtypes: float64(2), object(5)
memory usage: 459.2+ MB


In [6]:
df.describe()

Unnamed: 0,AverageTemperature,AverageTemperatureUncertainty
count,8235082.0,8235082.0
mean,16.72743,1.028575
std,10.35344,1.129733
min,-42.704,0.034
25%,10.299,0.337
50%,18.831,0.591
75%,25.21,1.349
max,39.651,15.396


In [7]:
df.isnull().sum()

dt                                    0
AverageTemperature               364130
AverageTemperatureUncertainty    364130
City                                  0
Country                               0
Latitude                              0
Longitude                             0
dtype: int64

# Preparation

In [8]:
## Only looking for observations in city of San Antonio, TX

san_antonio = df[df['City'] == 'San Antonio']

In [9]:
san_antonio.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
6618616,1820-01-01,7.169,3.676,San Antonio,United States,29.74N,97.85W
6618617,1820-02-01,11.448,2.964,San Antonio,United States,29.74N,97.85W
6618618,1820-03-01,15.003,2.413,San Antonio,United States,29.74N,97.85W
6618619,1820-04-01,21.022,2.136,San Antonio,United States,29.74N,97.85W
6618620,1820-05-01,23.938,1.923,San Antonio,United States,29.74N,97.85W


In [10]:
san_antonio.shape

(2325, 7)

In [11]:
san_antonio.isnull().sum()

dt                               0
AverageTemperature               9
AverageTemperatureUncertainty    9
City                             0
Country                          0
Latitude                         0
Longitude                        0
dtype: int64

In [12]:
san_antonio_0 = san_antonio[san_antonio['AverageTemperature'].isnull()]

In [13]:
san_antonio_0

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
6618638,1821-11-01,,,San Antonio,United States,29.74N,97.85W
6618639,1821-12-01,,,San Antonio,United States,29.74N,97.85W
6618640,1822-01-01,,,San Antonio,United States,29.74N,97.85W
6618641,1822-02-01,,,San Antonio,United States,29.74N,97.85W
6618642,1822-03-01,,,San Antonio,United States,29.74N,97.85W
6618648,1822-09-01,,,San Antonio,United States,29.74N,97.85W
6618649,1822-10-01,,,San Antonio,United States,29.74N,97.85W
6618650,1822-11-01,,,San Antonio,United States,29.74N,97.85W
6618651,1822-12-01,,,San Antonio,United States,29.74N,97.85W


In [14]:
## Several months in 1821 and 1822 have no temperature reading, so all observations before 1823 will be excluded

In [15]:
# Convert the dt column from string to a datetime type.
# assign() builds a new frame instead of writing into the filtered slice of df,
# which avoids pandas' chained-assignment pitfall and keeps the cell re-runnable.
san_antonio = san_antonio.assign(dt=pd.to_datetime(san_antonio['dt']))

In [16]:
san_antonio.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2325 entries, 6618616 to 6620940
Data columns (total 7 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   dt                             2325 non-null   datetime64[ns]
 1   AverageTemperature             2316 non-null   float64       
 2   AverageTemperatureUncertainty  2316 non-null   float64       
 3   City                           2325 non-null   object        
 4   Country                        2325 non-null   object        
 5   Latitude                       2325 non-null   object        
 6   Longitude                      2325 non-null   object        
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 145.3+ KB


In [17]:
# Set the date as the index, then sort the rows chronologically
san_antonio = san_antonio.set_index("dt").sort_index()

In [18]:
san_antonio.head()

Unnamed: 0_level_0,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1820-01-01,7.169,3.676,San Antonio,United States,29.74N,97.85W
1820-02-01,11.448,2.964,San Antonio,United States,29.74N,97.85W
1820-03-01,15.003,2.413,San Antonio,United States,29.74N,97.85W
1820-04-01,21.022,2.136,San Antonio,United States,29.74N,97.85W
1820-05-01,23.938,1.923,San Antonio,United States,29.74N,97.85W


In [19]:
san_antonio['month'] = san_antonio.index.month

In [20]:
san_antonio['year'] = san_antonio.index.year

In [21]:
san_antonio.head()

Unnamed: 0_level_0,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,month,year
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1820-01-01,7.169,3.676,San Antonio,United States,29.74N,97.85W,1,1820
1820-02-01,11.448,2.964,San Antonio,United States,29.74N,97.85W,2,1820
1820-03-01,15.003,2.413,San Antonio,United States,29.74N,97.85W,3,1820
1820-04-01,21.022,2.136,San Antonio,United States,29.74N,97.85W,4,1820
1820-05-01,23.938,1.923,San Antonio,United States,29.74N,97.85W,5,1820


In [22]:
## Observations before 1823 are removed because the 1821-1822 records contain missing values

san_antonio = san_antonio[san_antonio['year'] > 1822]

In [23]:
san_antonio.head()

Unnamed: 0_level_0,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,month,year
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1823-01-01,9.425,2.788,San Antonio,United States,29.74N,97.85W,1,1823
1823-02-01,9.864,3.157,San Antonio,United States,29.74N,97.85W,2,1823
1823-03-01,16.228,2.66,San Antonio,United States,29.74N,97.85W,3,1823
1823-04-01,20.097,3.091,San Antonio,United States,29.74N,97.85W,4,1823
1823-05-01,23.811,2.044,San Antonio,United States,29.74N,97.85W,5,1823


In [24]:
san_antonio.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2289 entries, 1823-01-01 to 2013-09-01
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   AverageTemperature             2289 non-null   float64
 1   AverageTemperatureUncertainty  2289 non-null   float64
 2   City                           2289 non-null   object 
 3   Country                        2289 non-null   object 
 4   Latitude                       2289 non-null   object 
 5   Longitude                      2289 non-null   object 
 6   month                          2289 non-null   int64  
 7   year                           2289 non-null   int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 160.9+ KB


In [25]:
san_antonio.isnull().sum()

AverageTemperature               0
AverageTemperatureUncertainty    0
City                             0
Country                          0
Latitude                         0
Longitude                        0
month                            0
year                             0
dtype: int64

In [28]:
san_antonio = san_antonio.drop(columns = ['City', 'Country', 'Latitude', 'Longitude'])

In [29]:
san_antonio.head()

Unnamed: 0_level_0,AverageTemperature,AverageTemperatureUncertainty,month,year
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1823-01-01,9.425,2.788,1,1823
1823-02-01,9.864,3.157,2,1823
1823-03-01,16.228,2.66,3,1823
1823-04-01,20.097,3.091,4,1823
1823-05-01,23.811,2.044,5,1823


# Write Function

In [30]:
def wrangle_sa_temp(path='GlobalLandTemperaturesByCity.csv',
                    city='San Antonio',
                    min_year=1823):
    """Load the city temperature CSV and return one city's monthly series.

    Called with no arguments this reproduces the notebook's preparation steps
    for San Antonio: filter to the city, index by date, add month/year columns
    and discard the early years that contain missing readings.

    Parameters
    ----------
    path : str or path-like, default 'GlobalLandTemperaturesByCity.csv'
        Location of the CSV file. It must contain the columns ``dt``,
        ``AverageTemperature``, ``AverageTemperatureUncertainty`` and ``City``.
    city : str, default 'San Antonio'
        Exact value of the ``City`` column to keep.
    min_year : int, default 1823
        First calendar year to keep; rows dated earlier are dropped. The
        default matches the notebook's finding that the San Antonio records
        for 1821-1822 have missing temperatures.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``dt`` (datetime, ascending) with the columns
        ``AverageTemperature``, ``AverageTemperatureUncertainty``, ``month``
        and ``year``.
    """
    # Only these columns are used below; skipping Country/Latitude/Longitude
    # at read time keeps the very large source file from being fully loaded.
    needed_columns = ['dt', 'AverageTemperature',
                      'AverageTemperatureUncertainty', 'City']
    df = pd.read_csv(path, usecols=needed_columns)

    # Keep only the requested city. The explicit copy makes the result an
    # independent frame so the column assignment below is well-defined.
    city_temps = df[df['City'] == city].copy()

    # Convert the dt column from string to a datetime type (done after the
    # filter so only the selected rows are parsed).
    city_temps['dt'] = pd.to_datetime(city_temps['dt'])

    # Set the date as the index, then sort the rows chronologically
    city_temps = city_temps.set_index('dt').sort_index()

    # Create month and year columns
    city_temps['month'] = city_temps.index.month
    city_temps['year'] = city_temps.index.year

    # Drop the early years (before min_year) that contain missing values
    city_temps = city_temps[city_temps['year'] >= min_year]

    # Remove the column that was only needed for filtering
    city_temps = city_temps.drop(columns=['City'])

    return city_temps



In [33]:
df2 = wrangle_sa_temp()

In [34]:
df2.head()

Unnamed: 0_level_0,AverageTemperature,AverageTemperatureUncertainty,month,year
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1823-01-01,9.425,2.788,1,1823
1823-02-01,9.864,3.157,2,1823
1823-03-01,16.228,2.66,3,1823
1823-04-01,20.097,3.091,4,1823
1823-05-01,23.811,2.044,5,1823


In [36]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2289 entries, 1823-01-01 to 2013-09-01
Data columns (total 4 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   AverageTemperature             2289 non-null   float64
 1   AverageTemperatureUncertainty  2289 non-null   float64
 2   month                          2289 non-null   int64  
 3   year                           2289 non-null   int64  
dtypes: float64(2), int64(2)
memory usage: 89.4 KB


In [37]:
df2.shape

(2289, 4)