# Data gathering

This Jupyter notebook combines several pickled dataframes (`*.df` files) into a single dataframe and removes any duplicate rows.

## Required packages

In [None]:
import pandas as pd # data analysis
from tqdm import tqdm # progress bars
import pickle # object serialization
import glob # Unix style pathname pattern expansion
import os # Miscellaneous operating system interfaces

## Change directory if needed

In [None]:
# Get the current working directory (shown as the cell output).
# The relative paths used further down (the "*.df" pattern and the output
# file names) are resolved against this directory.
os.getcwd()

In [None]:
# Change directory
# Set `newdir` to the folder that contains the .df files to combine.
newdir = '.' # '.' means the current directory, so the chdir below changes nothing; edit this path to work elsewhere.
os.chdir(newdir)

## Define the retrieval operation as a function

We wrap the retrieval in a function so that the temporary variables created during the process are discarded: once the function returns and its frame disappears, they go out of scope and their memory can be reclaimed.

In [None]:
def retrieve_combine():
    """Load every pickled dataframe in the current directory and combine them.

    Each file matching ``*.df`` in the current working directory is
    unpickled, and the resulting dataframes are concatenated into a single
    dataframe whose index is renumbered from 0.

    Returns:
        pandas.DataFrame: the rows of all loaded dataframes, in file-name
        order.

    Raises:
        ValueError: if no ``*.df`` file is found in the current directory.
    """
    # NOTE(security): unpickling can execute arbitrary code. Only run this on
    # .df files that come from a trusted source.
    df_list = []  # We will store the dataframes in a list
    # Sort the matches so the row order does not depend on the order in which
    # the operating system happens to list the directory.
    for file in sorted(glob.glob("*.df")):  # For each .df file,
        # The context manager closes the file handle as soon as it is loaded
        with open(file, 'rb') as handle:
            df_list.append(pickle.load(handle))  # Load the dataframe
    if not df_list:
        # pd.concat would also raise ValueError here, with a less helpful message
        raise ValueError("No .df files found in the current directory")
    # Combine them all in one dataframe
    return pd.concat(df_list, ignore_index=True)

In [None]:
# Load and combine every .df file found in the current directory
df = retrieve_combine()

In [None]:
# Print a summary of the combined dataframe (columns, non-null counts, dtypes)
df.info()

In [None]:
# Number of rows before removing duplicates
len(df)

In [None]:
# Remove rows that are identical across all columns, keeping the first
# occurrence of each. The surviving rows keep their original index labels,
# so the index is no longer contiguous after this step.
df = df.drop_duplicates()

In [None]:
# Number of rows after removing duplicates
len(df)

In [None]:
# Save the dataframe to disk
# NOTE(review): this file is written to the current directory with a .df
# extension, so it matches the "*.df" pattern used by retrieve_combine() and
# will be loaded together with the source files if the notebook is re-run here.
df.to_pickle("Historic_Observations.df")

In [None]:
# Also export the dataframe as CSV. With the default settings the index is
# written as the first, unnamed column.
df.to_csv("Historic_Observations.csv")