## Cleaning UCDP data

- The UCDP GED codebook is available [here](https://ucdp.uu.se/downloads/ged/ged231.pdf).

In [1]:
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [1]:
class UCDPCleaner():
    """Load, sort and lightly clean one UCDP event CSV file.

    The file ``<data_dir>/<filename>.csv`` is read with pandas, its
    ``date_start`` column is parsed to datetimes and the rows are sorted
    by that column. The result is kept in ``self.data``.
    """

    def __init__(self, filename, use_high=False, data_dir='../data/ucdp/'):
        """Read the CSV for ``filename`` and store the sorted frame.

        Args:
            filename: CSV base name, without the ``.csv`` extension.
            use_high: If True, rows whose ``best`` value is 0 get the value
                of their ``high`` column instead.
            data_dir: Directory that contains the CSV file.
        """
        self.filename = filename
        self.data_dir = data_dir
        # None when the file could not be read (see load_data).
        self.data = self.load_data(use_high)
        # NOTE(review): this keeps only the text before the first underscore,
        # so 'south_sudan' becomes 'south' -- confirm this is intended.
        self.region_name = filename.split('_')[0]

    def load_data(self, use_high):
        """Return the CSV as a DataFrame sorted by ``date_start``.

        Args:
            use_high: If True, replace zero ``best`` values with ``high``.

        Returns:
            The sorted DataFrame, or None (after printing a message) when
            the file is missing, unreadable or cannot be parsed.
        """
        fn = os.path.join(self.data_dir, self.filename + '.csv')
        try:
            ucdp = pd.read_csv(fn)
        except (OSError, ValueError):
            # OSError covers missing/unreadable files; pandas' ParserError and
            # EmptyDataError, as well as decoding errors, derive from
            # ValueError. Anything else (e.g. KeyboardInterrupt) propagates.
            print(f'Could not read {fn}')
            return None

        ucdp['date_start'] = pd.to_datetime(ucdp['date_start'])
        ucdp_sorted = ucdp.sort_values(by="date_start")

        if use_high:
            # Use the high estimate wherever the best estimate is zero.
            zero_best = ucdp_sorted['best'] == 0
            ucdp_sorted.loc[zero_best, 'best'] = ucdp_sorted.loc[zero_best, 'high']

        return ucdp_sorted

    def _require_data(self):
        """Return ``self.data`` or raise RuntimeError if loading failed."""
        if self.data is None:
            raise RuntimeError(
                f'No data loaded for {self.filename!r}; the CSV could not be read'
            )
        return self.data

    def plot(self):
        """Print the covered date range and plot ``best`` against ``date_start``.

        Raises:
            RuntimeError: If the CSV could not be loaded.
        """
        ucdp = self._require_data()

        dates_ucdp = ucdp["date_start"]
        target_ucdp = ucdp["best"]

        print(f'dates range from {dates_ucdp.min().date()} to {dates_ucdp.max().date()}')

        plt.figure(figsize=(12, 6))
        # Line plot of the fatality estimate over time.
        plt.plot(dates_ucdp, target_ucdp, label='UCDP Estimate')
        plt.xlabel('Date')
        plt.ylabel('Fatalities')

        plt.title(f'Fatalities Over Time (UCDP {self.region_name})')
        plt.legend()
        plt.show()

    # TODO: figure out conflict ID code duplicates
    def duration(self):
        """Add a ``duration`` column (``date_end - date_start``) to the data.

        ``date_start`` and ``date_end`` are converted to datetimes first.
        The columns are written onto ``self.data`` in place, and that same
        DataFrame is returned.

        Raises:
            RuntimeError: If the CSV could not be loaded.
        """
        data = self._require_data()
        data['date_start'] = pd.to_datetime(data['date_start'])
        data['date_end'] = pd.to_datetime(data['date_end'])
        data['duration'] = data['date_end'] - data['date_start']

        return data


In [24]:
# Load the South Sudan export and attach the per-event duration column.
cleaner = UCDPCleaner('south_sudan')
duration = cleaner.duration()

# Start date of the fourth event in chronological order.
print(duration['date_start'].iloc[3])

2016-01-01 00:00:00


In [25]:
# 'best' value of the same (fourth) event.
print(duration['best'].iloc[3])

1


In [31]:
# Number of distinct priogrid_gid values (a missing value counts as one).
cleaner.data['priogrid_gid'].nunique(dropna=False)
# TODO: split by dyad_new_id or conflict_id
# there are 300 data points in the largest conflict area

127