## Part 1 Correlation of Mean Annual Temperature with Altitude in Bavaria
In the first lectures we analysed the annual temperature in NRW by means of long time series. The observed temperature increase, particularly in the last decade, is most likely an indication of climate change. We also observed that the station "Kahler Asten" shows systematically lower temperatures than other stations. We presumed this to be an effect of temperature decreasing with topographic height, since "Kahler Asten" is among the highest points in NRW. 

Verify this hypothesis by means of data in Bavaria. This federal state reveals the broadest range of topographic heights, from 100m to more than 2800m above Normal-Null (NN). 

## Task 1
Plot the annual mean temperatures of **years 2017, 2018, and 2019** versus altitude for the DWD stations in Bavaria. At first use the **altitudes from the station description file** `KL_Jahreswerte_Beschreibung_Stationen.txt` for the data set `/annual/kl/historical/`.

## Importing necessary libraries for Part 1

In [1]:
from datetime import datetime #used for time format conversion
import os #access to host system to create directories and write files
import ftplib #libary to access ftp server
import codecs
from zipfile import ZipFile #used for unzipping zip files
import numpy as np #for replacing bad values with true NotaNumber from numpy
import time
import matplotlib.pyplot as plt
%matplotlib inline 
#making plots available in jupyter output line
import pandas as pd #for pandas dataframe to read csv
pd.options.display.max_seq_items = None #pandas printing options
#pd.set_option('display.max_rows', 500)
pd.set_option('display.max_rows', 5)
pd.set_option('display.max_columns', 50)
#pd.set_option('display.width', 1000)
from sklearn.linear_model import LinearRegression #linear regression to calculate a trendline from data points, Task 4

## Defining variables

In [2]:
# Login data for the FTP server; the user name is "anonymous" with an empty password.
ftp_server = "opendata.dwd.de"
ftp_user = "anonymous"
ftp_passwd = ""
# Remote directory of the historical annual "kl" data set.
ftp_dir =  "/climate_environment/CDC/observations_germany/climate/annual/kl/historical/"
# Common suffix of the station description file name.
station_desc_pattern = "_Beschreibung_Stationen.txt"
# Federal state whose stations are selected (matched against the "state" column).
state = "Bayern"
# Years requested by Task 1 and their count.
years = [2017, 2018, 2019]
nyears = len(years)

## Process functions

In [3]:
def connect_ftp():
    """Open the FTP session that the download helpers share.

    Connects to ``ftp_server``, logs in with ``ftp_user`` / ``ftp_passwd``
    and changes into ``ftp_dir``. The connection object is stored in the
    module-level ``ftp``. Connection or login failures surface as the
    exceptions raised by ``ftplib``; nothing is caught here.
    """
    global ftp
    ftp = ftplib.FTP(
        host=ftp_server,
        user=ftp_user,
        passwd=ftp_passwd,
        timeout=None,
    )
    # Later listing and download commands are issued from this directory.
    ftp.cwd(ftp_dir)

In [4]:
def create_dir():
    """Create the local ``data`` directory below the current working directory.

    Sets two module-level globals:

    * ``dir`` -- the current working directory. The name is kept for
      compatibility although it shadows the builtin ``dir``.
    * ``local_dir`` -- ``<cwd>/data/`` including the trailing path separator,
      so file names can be appended by plain string concatenation.

    An already existing directory is not an error.
    """
    global dir
    global local_dir
    dir = os.getcwd()
    # os.getcwd() has no trailing separator, so dir + "data/" would create a
    # sibling such as "/home/userdata/" instead of "/home/user/data/".
    # The empty last component makes join() append the trailing separator.
    local_dir = os.path.join(dir, "data", "")
    os.makedirs(local_dir, exist_ok=True)

In [9]:
def gen_df_from_ftp_dir():
    """List the current FTP directory and index the station archives.

    Uses the module-level ``ftp`` connection. Entries named like
    ``jahreswerte_KL_00001_19310101_19851231_hist.zip`` become rows of the
    result. The entry whose name starts with ``KL_`` (for example
    ``KL_Jahreswerte_Beschreibung_Stationen.txt``) is stored in the
    module-level ``station_desc_fname``. All other entries are ignored.

    Returns:
        A DataFrame with the columns ``station_id`` (int, taken from the
        third ``_``-separated part of the file name) and ``fname``, or
        ``None`` when the directory listing fails with an FTP/network error.
    """
    global station_desc_fname
    lines = []
    try:
        ftp.retrlines("NLST", lines.append)
    except ftplib.all_errors:
        # Still best effort for FTP/network failures, but programming errors
        # (e.g. a missing connection object) are no longer hidden.
        return None

    flist = []
    for fname in lines:
        parts = fname.split("_")
        if parts[0] == "jahreswerte":
            # parts == ['jahreswerte', 'KL', '00001', '19310101', ...]:
            # the station id is the third part, parts[1] is always 'KL'.
            if len(parts) > 2 and parts[2].isdigit():
                flist.append([int(parts[2]), fname])
        elif parts[0] == "KL":
            station_desc_fname = fname
    # Dates in the file names differ from the station description (e.g.
    # station 769: 2005 in the file name, 2020 in the description), so only
    # the id and the name are kept here; dates come from the description.
    return pd.DataFrame(flist, columns=["station_id", "fname"])

In [13]:
    #not working: ftplib.FTP has no open() method (see the AttributeError below),
    #so the description file cannot be read directly from the connection this way.
    with ftp.open(station_desc_fname) as f:
        df1 = pd.read_fwf(f)
        print(df1)
AttributeError: 'FTP' object has no attribute 'open'

In [None]:
def station_desc_txt_to_df():
    """Download the station description file and parse it into a DataFrame.

    Uses the module-level ``ftp`` connection, ``local_dir`` and
    ``station_desc_fname``. The file is saved as
    ``local_dir + station_desc_fname`` and then read as a fixed-width table.

    Returns:
        A DataFrame indexed by the first column, with the German header
        names replaced according to the translation table below and
        ``date_from`` / ``date_to`` parsed as dates.
    """
    txtfile = local_dir + station_desc_fname
    with open(txtfile, "wb") as localfile:
        ftp.retrbinary("RETR " + station_desc_fname, localfile.write, 1024)

    # Only the header line is decoded here; it holds the German column names.
    with open(txtfile, "rb") as header_file:
        colnames_de = header_file.readline().decode("utf-8").split()

    translate = {
        'Stations_id': 'station_id',
        'von_datum': 'date_from',
        'bis_datum': 'date_to',
        'Stationshoehe': 'altitude',
        'geoBreite': 'latitude',
        'geoLaenge': 'longitude',
        'Stationsname': 'name',
        'Bundesland': 'state',
    }
    # Headers missing from the table keep their original name instead of
    # aborting with a KeyError.
    colnames_en = [translate.get(h, h) for h in colnames_de]

    # skiprows=2 drops the header line and the line that follows it.
    df = pd.read_fwf(
        txtfile,
        skiprows=2,
        colspecs='infer',
        names=colnames_en,
        parse_dates=["date_from", "date_to"],
        index_col=0,
    )
    return df

In [None]:
def download_stations():
    """Download the archive of every selected station.

    Reads the module-level ``station_ids_selected``, ``df_zips`` (indexed by
    station id, archive names in column ``fname``), ``ftp_dir`` and
    ``local_ftp_ts_dir``. The names of the archives handed to ``grabFile``
    without an error are collected in the module-level ``local_zip_list``.
    """
    global local_zip_list
    local_zip_list = []
    for station_id in station_ids_selected:
        try:
            # gen_df_from_ftp_dir() names this column "fname"; looking up
            # "name" raised a KeyError for every station.
            fname = df_zips["fname"][station_id]
        except KeyError:
            # No archive is listed for this station: skip it.
            continue
        try:
            # NOTE(review): grabFile and local_ftp_ts_dir are not defined in
            # the visible part of this notebook -- confirm they exist.
            grabFile(ftp_dir + fname, local_ftp_ts_dir + fname)
        except ftplib.all_errors:
            # Best effort: a failed transfer only skips this station.
            continue
        local_zip_list.append(fname)

In [None]:
def kl_ts_to_df(fname):
    """Read one semicolon-separated KL time-series file into a DataFrame.

    Args:
        fname: Path or open file object accepted by ``pandas.read_csv``.

    Returns:
        A DataFrame indexed by ``mess_datum_beginn`` (datetime), restricted
        to rows whose start date lies between the module-level ``date_from``
        and ``date_to`` (both inclusive). Column names are stripped and
        lower-cased, with spaces replaced by ``_`` and parentheses removed.
        The values ``-999`` / ``-999.0`` are read as NaN.
    """
    date_cols = ["MESS_DATUM_BEGINN", "MESS_DATUM_ENDE"]
    df = pd.read_csv(
        fname,
        delimiter=";",
        encoding="utf8",
        dtype={col: str for col in date_cols},
        na_values=[-999.0, -999],
    )
    # Dates are parsed explicitly: read_csv's date_parser argument is
    # deprecated since pandas 2.0, while to_datetime works on every version.
    for col in date_cols:
        df[col] = pd.to_datetime(df[col], format="%Y%m%d")
    df = df.set_index("MESS_DATUM_BEGINN")

    # NOTE(review): date_from / date_to are not defined in the visible part
    # of this notebook; they must exist at module level before the call.
    df = df[(df.index >= date_from) & (df.index <= date_to)]

    # regex=False: "(" and ")" are meant literally, not as regex groups.
    df.columns = (
        df.columns.str.strip()
        .str.lower()
        .str.replace(" ", "_", regex=False)
        .str.replace("(", "", regex=False)
        .str.replace(")", "", regex=False)
    )
    df.index.name = (
        df.index.name.strip().lower().replace(" ", "_").replace("(", "").replace(")", "")
    )
    return df

In [None]:
def ts_merge():
    """Combine the ``ja_tt`` series of all downloaded stations into one table.

    Reads every archive named in the module-level ``local_zip_list`` from
    ``local_ftp_ts_dir``, parses the member whose name starts with
    ``produkt`` via ``kl_ts_to_df`` and outer-joins the ``ja_tt`` column of
    each station on the time index.

    Returns:
        A DataFrame indexed by ``time`` with one column per station, named
        after the station id. Columns containing any missing value are
        dropped.
    """
    df = pd.DataFrame()
    for zip_name in local_zip_list:
        with ZipFile(local_ftp_ts_dir + zip_name) as myzip:
            # The time series is in the member starting with "produkt".
            prodfilename = [
                member for member in myzip.namelist()
                if member.split("_")[0] == "produkt"
            ][0]
            with myzip.open(prodfilename) as myfile:
                dftmp = kl_ts_to_df(myfile)
        if len(dftmp) == 0:
            # Nothing inside the requested date range for this station.
            continue
        # .iloc[0]: the frame is indexed by date, so a plain [0] is an
        # integer lookup on a non-integer index (deprecated in pandas 2.x).
        station_id = dftmp["stations_id"].iloc[0]
        s = dftmp["ja_tt"].rename(station_id).to_frame()
        df = pd.merge(df, s, left_index=True, right_index=True, how='outer')
    df = df.dropna(axis='columns')
    df.index.rename(name="time", inplace=True)
    return df

In [None]:
def ts_append():
    """Stack the time series of all downloaded stations, joined with metadata.

    Reads every archive named in the module-level ``local_zip_list`` from
    ``local_ftp_ts_dir``, parses the member whose name starts with
    ``produkt`` via ``kl_ts_to_df`` and inner-joins each station's rows with
    ``df_stations`` (``stations_id`` against the ``df_stations`` index).

    Returns:
        A DataFrame with the index named ``time`` holding the rows of all
        stations. ``-999`` is replaced by NaN, and rows lacking a value in
        the columns named by the module-level ``o1`` or ``o2`` are dropped.
    """
    frames = []
    for zip_name in local_zip_list:
        with ZipFile(local_ftp_ts_dir + zip_name) as myzip:
            prodfilename = [
                member for member in myzip.namelist()
                if member.split("_")[0] == "produkt"
            ][0]
            with myzip.open(prodfilename) as myfile:
                dftmp = kl_ts_to_df(myfile)
        if len(dftmp) > 0:
            # right_index=True already selects the df_stations index as the
            # join key, so the former right_on="station_id" was redundant.
            frames.append(
                dftmp.merge(df_stations, how="inner", left_on="stations_id", right_index=True)
            )

    # DataFrame.append was removed in pandas 2.0; one concat at the end also
    # avoids copying the growing frame on every iteration.
    df = pd.concat(frames) if frames else pd.DataFrame()
    df.index.rename(name="time", inplace=True)

    df.replace(to_replace=-999, value=np.nan, inplace=True)

    # NOTE(review): o1 / o2 are not defined in the visible part of this
    # notebook; they must name existing columns.
    df = df.dropna(subset=[str(o1), str(o2)])
    return df

In [None]:
def plot():
    """Scatter-plot column ``o1`` against ``o2`` with a linear regression line.

    Reads the module-level ``df_appended_ts``, ``o1`` (y axis), ``o2``
    (x axis), ``year_selected`` and ``state``. A ``LinearRegression`` is
    fitted to the points; its slope, intercept and R^2 score (rounded to
    four decimals) appear in the plot title. The figure is shown and saved
    as a PNG in the current working directory.
    """
    retranslate = {"ja_tt":"Average Temperature","ja_tx":"Yearly Average Max Temperature","ja_tn":"Yearly Average Min Temperature","ja_fk":"Average Windforce","ja_sd_s":"Sum Yearly Sunshine Duration","ja_mx_tx":"Absolute Max Temperature","ja_mx_tn":"Absolute Min Temperature","ja_rr":"Sum Yearly Precipitation","ja_mx_rs":"Max Precipitation Height","altitude":"Altitude","latitude":"Latitude","longitude":"Longitude"}
    po1 = retranslate[o1]
    po2 = retranslate[o2]
    # Label variants without spaces are used for the output file name.
    fpo1 = po1.replace(" ", "_")
    fpo2 = po2.replace(" ", "_")
    # str() keeps the title and file name working when the year is an int,
    # as the entries of the `years` list are.
    year_label = str(year_selected)

    # scikit-learn expects 2-D arrays, hence one column per variable.
    Y = df_appended_ts.loc[:, o1].values.reshape(-1, 1)
    X = df_appended_ts.loc[:, o2].values.reshape(-1, 1)
    linear_regressor = LinearRegression()
    linear_regressor.fit(X, Y)
    score = linear_regressor.score(X, Y)
    Y_pred = linear_regressor.predict(X)

    b = round(linear_regressor.intercept_[0], 4)
    m = round(linear_regressor.coef_[0][0], 4)
    r = round(score, 4)

    fig1, ax1 = plt.subplots(dpi=136, figsize=(8, 6))
    ax1.plot(X, Y_pred, color='red')
    ax1.plot(df_appended_ts[o2], df_appended_ts[o1], ".")
    ax1.set_ylabel(po1)
    ax1.set_xlabel(po2)
    ax1.set_title(po1+" vs. "+po2+" in Year " + year_label + " at DWD Stations in " + state+"\ny="+str(m)+"*x+"+str(b)+", R^2= "+str(r))
    ax1.grid(True)
    plt.show()
    fig1.savefig(fpo1+"_"+fpo2+"_"+year_label+"_DWD_Stations_"+state+".png")
    print("A low R^2 value indicates, that the regression model is not fitting well (no strong correlation of data points).\n")

In [None]:
def process():
    """Run the download-and-prepare pipeline and publish the results as globals.

    Steps: create the local directory, connect to the FTP server, index the
    remote archives, load the station description, select the stations of
    ``state``, download their archives and build the merged and appended
    time-series tables, which are also written to CSV files (``;``
    separated).

    Sets the module-level ``df_zips``, ``basename``, ``df_stations``,
    ``station_ids_selected``, ``df_merged_ts`` and ``df_appended_ts``.
    """
    # "global name = value" is a SyntaxError; the declaration and the
    # assignment must be separate statements.
    global df_zips, basename, df_stations, station_ids_selected
    global df_merged_ts, df_appended_ts

    create_dir()
    connect_ftp()
    df_zips = gen_df_from_ftp_dir()
    df_zips.set_index("station_id", inplace=True)

    # NOTE(review): station_grab, station_fname, station_desc_txt_to_csv and
    # the local_*_dir variables used below are not defined in the visible
    # part of this notebook; station_desc_txt_to_df() looks like the intended
    # replacement for the two description-file steps -- confirm.
    station_grab()
    basename = os.path.splitext(station_fname)[0]
    df_stations = station_desc_txt_to_csv(
        local_ftp_station_dir + station_fname,
        local_station_dir + basename + ".csv",
    )
    station_ids_selected = df_stations[df_stations['state'].str.contains(state)].index
    download_stations()

    df_merged_ts = ts_merge()
    df_merged_ts.to_csv(local_ts_merged_dir + "ts_merged.csv", sep=";")
    df_appended_ts = ts_append()
    df_appended_ts.to_csv(local_ts_appended_dir + "ts_appended.csv", sep=";")

## Main run function

In [1]:
# Run the complete pipeline: process() prepares the data, plot() draws the result.
# Both functions must already be defined in the running kernel session,
# otherwise the call fails with a NameError as in the output below.
print("Loading...\n")
process()
print("Plotting...\n")
plot()

Loading...



NameError: name 'process' is not defined

In [10]:
# Manual walk-through of the first pipeline steps: local directory, FTP
# session, then the archive listing indexed by station id.
create_dir()
connect_ftp()
# At module level a "global" declaration has no effect, so a plain
# assignment binds the same name.
df_zips = gen_df_from_ftp_dir()
df_zips.set_index(keys="station_id", inplace=True)