# Data Cleaning - John Preston


In [None]:
#Importing libraries
import os

import pandas as pd

In [None]:
#Mounting Google Drive
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Path of the "My Drive" folder inside the mounted Google Drive.
root_dir = "/content/drive/My Drive/"

# The data folder sits under "Colab Notebooks" at the top level of the Google Drive.
data_folder = os.path.join(root_dir, "Colab Notebooks/MPP Science Replication Package/Data/")

# Make the data folder the current working directory.
os.chdir(data_folder)

In [None]:
def dta_to_csv(dta_file_path):
  """
  Convert a Stata .dta file to a .csv file.

  The result is stored in a subdirectory named "csv" inside the directory that
  holds the .dta file. That subdirectory is created if it does not exist yet.
  The output keeps the input's base name with the extension replaced by ".csv".

  Arguments:
    dta_file_path: Path to the .dta file

  """

  # get the directory and file name from the path
  directory_name, file_name = os.path.split(dta_file_path)

  # read the .dta file into a DataFrame
  print("Converting file", dta_file_path)
  data = pd.read_stata(dta_file_path)

  # Build the output directory with os.path.join so a path without a directory
  # part resolves to "./csv" instead of "/csv" at the filesystem root, and make
  # sure it exists so the function also works when it is called on its own.
  csv_directory = os.path.join(directory_name, "csv")
  os.makedirs(csv_directory, exist_ok=True)

  # replace the extension via splitext rather than slicing off a fixed 4 characters
  base_name = os.path.splitext(file_name)[0]

  # NOTE: the DataFrame index is written too, so it comes back as an unnamed
  # first column when the .csv is read without index_col=0.
  data.to_csv(os.path.join(csv_directory, base_name + ".csv"))

In [None]:
def convert_all_dta_to_csv(data_folder):
  """
  Recursively search data_folder for Stata files and convert each one.

  For every file whose extension is ".dta" (compared case-insensitively), a
  "csv" subdirectory is created next to it if it is missing, and the file is
  then passed to dta_to_csv().
  """

  for current_dir, _subdirs, file_names in os.walk(data_folder):
    for name in file_names:
      # anything that is not a .dta file is skipped
      if os.path.splitext(name)[1].lower() != '.dta':
        continue

      # the converted file goes into a 'csv' folder beside its source
      output_dir = os.path.join(current_dir, 'csv')
      if not os.path.exists(output_dir):
        os.makedirs(output_dir)

      # hand the full path of the .dta file to the converter
      dta_to_csv(os.path.join(current_dir, name))

# call the function to do the conversion
convert_all_dta_to_csv(data_folder)

Converting file /content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/Randomization and heterogeneity.dta
Converting file /content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/CCTV/cctv_baseline data.dta
Converting file /content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/CCTV/cctv_full data.dta
Converting file /content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/Citizen survey/citizen_full data.dta
Converting file /content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/Citizen survey/citizen_caw rates.dta
Converting file /content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/Admin/admin_long data.dta
Converting file /content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/Admin/admin_wide data.dta
Converting file /content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/Police survey/police_baseline data.dta
Con

# Data Cleaning: Admin_long, Police_full, Police_baseline, and Citizen_full.


In [None]:
def list_datasets(data_folder, extensions=('.csv',)):
    """
    Print the path of every dataset file found under data_folder.

    The directory tree is walked recursively and a file is listed when its
    extension, compared case-insensitively, is one of `extensions`.

    Arguments:
      data_folder: Directory to search.
      extensions: Extensions to list, each including the leading dot. Defaults
        to ('.csv',); pass e.g. ('.csv', '.dta', '.xlsx') to list other dataset
        formats as well. A single string is treated as one extension.
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(extensions, str):
        extensions = (extensions,)

    # Lower-case the wanted extensions once so both sides of the comparison agree
    dataset_extensions = {ext.lower() for ext in extensions}

    # Walk through the directory
    for root, _dirs, files in os.walk(data_folder):
        for file in files:
            # Only the extension is needed to decide whether to list the file
            extension = os.path.splitext(file)[1]

            # If the extension is in the supported list, print the file path
            if extension.lower() in dataset_extensions:
                print(os.path.join(root, file))


# List the datasets available in the folder
list_datasets(data_folder)

/content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/admin_long.csv
/content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/police_full.csv
/content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/police_base.csv
/content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/citizen_full.csv
/content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/CCTV/csv/cctv_baseline data.csv
/content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/CCTV/csv/cctv_full data.csv
/content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/Citizen survey/csv/citizen_caw rates.csv
/content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/Citizen survey/csv/citizen_full data.csv
/content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/Admin/csv/admin_long data.csv
/content/drive/My Drive/Colab Notebooks/MPP Science Replication Package

In [None]:
# Observing and cleaning of Admin_long dataset
# The path is built from data_folder (set in the configuration cell above) instead of
# repeating the absolute Drive path, so moving the data only requires changing one cell.
admin_long_data = os.path.join(data_folder, 'Admin', 'csv', 'admin_long data.csv')

#Creating df for admin long data
# NOTE: the converted .csv was written with its row index, which loads here as an 'Unnamed: 0' column.
admin_long_df = pd.read_csv(admin_long_data)

In [None]:
#Using head() to get overview of df
admin_long_df.head()

Unnamed: 0.1,Unnamed: 0,month,fir_overall_count,fir_caw_count,fir_bywomen_count,arrest_count,dir_count,ncr_count,dial100_count,ps_code,...,dist_urban,strat_pca,treatment,group,dist_id,implement_quality,training_score,comm_outreach_strength,regular_whd,women_whd
0,0,2019-06-01,35,8,5,20,0,100,40.0,1001.0,...,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0
1,1,2019-11-01,29,10,4,10,1,0,35.0,1001.0,...,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0
2,2,2018-10-01,26,8,3,12,0,72,25.0,1001.0,...,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0
3,3,2018-11-01,27,8,4,12,0,64,41.0,1001.0,...,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0
4,4,2020-05-01,32,6,4,10,0,22,55.0,1001.0,...,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0


In [None]:
#Checking the shape of admin long data
admin_long_df.shape

(4500, 22)

In [None]:
#Overview of Data types and other information
admin_long_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              4500 non-null   int64  
 1   month                   4500 non-null   object 
 2   fir_overall_count       4500 non-null   int64  
 3   fir_caw_count           4500 non-null   int64  
 4   fir_bywomen_count       4500 non-null   int64  
 5   arrest_count            4500 non-null   int64  
 6   dir_count               4500 non-null   int64  
 7   ncr_count               4500 non-null   int64  
 8   dial100_count           4500 non-null   float64
 9   ps_code                 4500 non-null   float64
 10  population              4500 non-null   int64  
 11  urban                   4500 non-null   object 
 12  dist_urban              4500 non-null   float64
 13  strat_pca               4500 non-null   float64
 14  treatment               4500 non-null   

In [None]:
#Overview of basic statistics
admin_long_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,4500.0,2249.5,1299.182435,0.0,1124.75,2249.5,3374.25,4499.0
fir_overall_count,4500.0,33.33067,23.118133,0.0,18.0,28.0,43.0,419.0
fir_caw_count,4500.0,3.958667,3.754298,0.0,1.0,3.0,6.0,23.0
fir_bywomen_count,4500.0,2.606667,2.065057,0.0,1.0,2.0,4.0,11.0
arrest_count,4500.0,3.590444,4.861089,0.0,0.0,2.0,5.0,52.0
dir_count,4500.0,0.5071111,2.100433,0.0,0.0,0.0,0.0,33.0
ncr_count,4500.0,43.77689,34.747282,0.0,19.0,36.0,59.0,243.0
dial100_count,4500.0,26.79978,20.344907,0.0,13.0,21.0,35.0,254.0
ps_code,4500.0,5857.167,3109.786359,1001.0,3045.75,5090.5,8135.25,12180.0
population,4500.0,130072.9,86182.930694,23565.0,76491.5,111183.5,151937.0,545000.0


In [None]:
#Frequency tables for the variables we use: month and fir_bywomen_count
month_counts = admin_long_df['month'].value_counts()
fir_bywomen_counts = admin_long_df['fir_bywomen_count'].value_counts()

# one table per line block, in the same order as they were computed
print(month_counts, fir_bywomen_counts, sep='\n')

month
2019-06-01    180
2019-10-01    180
2019-02-01    180
2019-08-01    180
2020-04-01    180
2020-01-01    180
2019-12-01    180
2018-07-01    180
2019-01-01    180
2019-04-01    180
2020-02-01    180
2018-05-01    180
2018-12-01    180
2019-11-01    180
2018-09-01    180
2019-09-01    180
2020-03-01    180
2019-07-01    180
2018-06-01    180
2019-03-01    180
2018-08-01    180
2020-05-01    180
2018-11-01    180
2018-10-01    180
2019-05-01    180
Name: count, dtype: int64
fir_bywomen_count
1     841
2     808
3     759
0     748
4     520
5     391
6     218
7     122
8      58
9      20
10     10
11      5
Name: count, dtype: int64


In [None]:
# Write admin_long_df to "admin_long_df.csv", leaving out the row index
admin_long_df.to_csv(path_or_buf='/content/drive/MyDrive/admin_long_df.csv', index=False)

In [None]:
# Observing and cleaning of Police_full
# The path is built from data_folder (set in the configuration cell above) instead of
# repeating the absolute Drive path, so moving the data only requires changing one cell.
police_full = os.path.join(data_folder, 'Police survey', 'csv', 'police_full data.csv')

#Creating df for Police_full data.
# NOTE: the converted .csv was written with its row index, which loads here as an 'Unnamed: 0' column.
police_full_df = pd.read_csv(police_full)

In [None]:
#Using head() to get overview of df
police_full_df.head()

Unnamed: 0.1,Unnamed: 0,uid,gender,e_wcase,e_effective,e_false_case,e_helpful,e_add_officer,e_add_female,e_female_better,...,women_whd,b_wcase,b_effective,b_pol_impt,b_thana_impt,b_sensitivity,b_helpful,b_female_better,b_add_officer,b_add_female
0,0,2240137,male,too much attention,very effective,common,very helpful,more effective,Less effective,female,...,1.0,2.7,4.5,0.5,0.4,7.3,4.2,2.7,4.5,4.6
1,1,2240133,male,too much attention,very effective,common,very helpful,much more effective,No difference,female,...,1.0,2.7,4.5,0.5,0.4,7.3,4.2,2.7,4.5,4.6
2,2,2240136,male,too much attention,very effective,uncommon,very helpful,much more effective,Less effective,female,...,1.0,2.7,4.5,0.5,0.4,7.3,4.2,2.7,4.5,4.6
3,3,2240132,female,too much attention,very effective,common,helpful,much more effective,Less effective,female,...,1.0,2.7,4.5,0.5,0.4,7.3,4.2,2.7,4.5,4.6
4,4,2240134,male,too much attention,very effective,very common,very helpful,much more effective,No difference,female,...,1.0,2.7,4.5,0.5,0.4,7.3,4.2,2.7,4.5,4.6


In [None]:
#Checking the shape of police_full
police_full_df.shape

(1961, 38)

In [None]:
#Overview of Data types and checking for any missing values
police_full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1961 entries, 0 to 1960
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              1961 non-null   int64  
 1   uid                     1961 non-null   int64  
 2   gender                  1961 non-null   object 
 3   e_wcase                 1948 non-null   object 
 4   e_effective             1959 non-null   object 
 5   e_false_case            1932 non-null   object 
 6   e_helpful               1958 non-null   object 
 7   e_add_officer           1955 non-null   object 
 8   e_add_female            1956 non-null   object 
 9   e_female_better         1956 non-null   object 
 10  e_taken_seriously       1951 non-null   object 
 11  e_prof_dev              1951 non-null   object 
 12  e_work_help             1954 non-null   object 
 13  e_pol_impt              1961 non-null   float64
 14  e_thana_impt            1961 non-null   

In [None]:
#There are missing values in the above columns. Checking to see if any missing variables are numerical or important to the project.
police_full_df.describe().T.round(2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,1961.0,980.0,566.24,0.0,490.0,980.0,1470.0,1960.0
uid,1961.0,2621999.48,307310.94,2141731.0,2357536.0,2554033.0,2845339.0,3257341.0
e_pol_impt,1961.0,0.39,0.49,0.0,0.0,0.0,1.0,1.0
e_thana_impt,1961.0,0.45,0.5,0.0,0.0,0.0,1.0,1.0
e_sensitivity,1961.0,7.54,2.55,0.0,6.0,7.0,9.0,19.0
ps_code,1961.0,5831.55,3104.93,1001.0,3046.0,5090.0,8135.0,12180.0
population,1961.0,130111.21,86078.79,23565.0,76376.0,111000.0,152000.0,545000.0
dist_urban,1961.0,8.57,4.89,1.0,4.0,8.0,12.0,18.0
strat_pca,1961.0,0.01,1.37,-2.36,-1.13,-0.12,1.0,3.99
dist_id,1961.0,5741.46,3054.43,1000.0,3000.0,5000.0,8000.0,12000.0


In [None]:
#Frequency tables for the variables we use: women_whd, treatment, e_effective, and b_helpful
women_whd_counts = police_full_df['women_whd'].value_counts()
treatment_counts = police_full_df['treatment'].value_counts()
e_effective_counts = police_full_df['e_effective'].value_counts()
b_helpful_counts = police_full_df['b_helpful'].value_counts()

# print the four tables one after another, in the order listed above
print(women_whd_counts, treatment_counts, e_effective_counts, b_helpful_counts, sep='\n')


women_whd
0.0    1316
1.0     645
Name: count, dtype: int64
treatment
Treatment    1306
Control       655
Name: count, dtype: int64
e_effective
very effective      1174
effective            776
ineffective            7
very ineffective       2
Name: count, dtype: int64
b_helpful
5.000000    742
4.000000    378
4.545454     54
4.454546     54
4.500000     51
4.750000     46
4.727272     42
4.363637     41
4.666666     40
4.818182     33
4.909091     33
4.900000     32
4.636363     31
4.444445     29
4.583334     27
4.700000     23
4.833334     23
4.300000     22
4.555555     22
4.800000     21
4.600000     18
4.888889     18
4.083334     18
4.846154     15
4.166666     14
4.777778     14
4.333334     12
4.416666     11
4.250000     10
4.272728      9
4.538462      9
3.000000      8
3.909091      6
4.230769      6
4.375000      6
4.200000      6
4.916666      6
4.692308      6
4.714286      5
4.461538      5
4.181818      4
4.400000      4
4.090909      3
2.000000      2
4.875000      1


In [None]:
# Write police_full_df to "police_full.csv", leaving out the row index
police_full_df.to_csv(path_or_buf='/content/drive/MyDrive/police_full.csv', index=False)

In [None]:
# Observing and cleaning of Police_baseline dataset.
# The path is built from data_folder (set in the configuration cell above) instead of
# repeating the absolute Drive path, so moving the data only requires changing one cell.
police_base = os.path.join(data_folder, 'Police survey', 'csv', 'police_baseline data.csv')

#Creating df for Police_base data.
# NOTE: the converted .csv was written with its row index, which loads here as an 'Unnamed: 0' column.
police_base_df = pd.read_csv(police_base)

In [None]:
#Getting overview of data set
police_base_df.head()

Unnamed: 0.1,Unnamed: 0,b_uid,gender,b_pol_impt,b_thana_impt,b_wcase,b_effective,b_helpful,b_add_officer,b_add_female,...,dist_urban,strat_pca,treatment,group,dist_id,implement_quality,training_score,comm_outreach_strength,regular_whd,women_whd
0,0,2240165,m,0.0,0.0,enough attention,very effective,helpful,more effective,much less effective,...,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0
1,1,2240171,m,1.0,1.0,enough attention,effective,helpful,much more effective,less effective,...,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0
2,2,2240167,m,1.0,1.0,too much attention,very effective,helpful,more effective,much less effective,...,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0
3,3,2240172,m,0.0,0.0,too much attention,effective,helpful,much more effective,less effective,...,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0
4,4,2240169,m,0.0,0.0,too much attention,effective,helpful,much more effective,much less effective,...,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0


In [None]:
#Checking data shape for police_baseline
police_base_df.shape

(1950, 27)

In [None]:
#Overview of police_baseline data info
police_base_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1950 entries, 0 to 1949
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              1950 non-null   int64  
 1   b_uid                   1950 non-null   int64  
 2   gender                  1950 non-null   object 
 3   b_pol_impt              1918 non-null   float64
 4   b_thana_impt            1924 non-null   float64
 5   b_wcase                 1948 non-null   object 
 6   b_effective             1943 non-null   object 
 7   b_helpful               1947 non-null   object 
 8   b_add_officer           1948 non-null   object 
 9   b_add_female            1949 non-null   object 
 10  b_female_better         1947 non-null   object 
 11  b_sensitivity           1945 non-null   float64
 12  uid                     1139 non-null   float64
 13  attrit                  1950 non-null   float64
 14  ps_code                 1950 non-null   

In [None]:
#Using .describe to view police_baseline
police_base_df.describe().T.round(2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,1950.0,974.5,563.06,0.0,487.25,974.5,1461.75,1949.0
b_uid,1950.0,2621708.63,309351.97,2141761.0,2356266.25,2553965.5,2847364.75,3257271.0
b_pol_impt,1918.0,0.38,0.49,0.0,0.0,0.0,1.0,1.0
b_thana_impt,1924.0,0.4,0.49,0.0,0.0,0.0,1.0,1.0
b_sensitivity,1945.0,7.04,2.45,1.0,5.0,7.0,8.0,16.0
uid,1139.0,2601023.96,297854.78,2141733.0,2353639.5,2551635.0,2753741.5,3257341.0
attrit,1950.0,0.42,0.49,0.0,0.0,0.0,1.0,1.0
ps_code,1950.0,5824.59,3132.06,1001.0,3044.0,5089.0,8136.0,12180.0
population,1950.0,129263.01,84928.09,23565.0,76376.0,111000.0,151916.0,545000.0
dist_urban,1950.0,8.54,4.93,1.0,4.0,8.0,12.0,18.0


In [None]:
#Frequency tables for the police_baseline variables (gender and b_effective). The police_full variables were checked earlier.
gender_counts = police_base_df['gender'].value_counts()
b_effective_counts = police_base_df['b_effective'].value_counts()

# print both tables, gender first
print(gender_counts, b_effective_counts, sep='\n')


gender
m    1713
f     237
Name: count, dtype: int64
b_effective
very effective                       1210
effective                             692
neither effective nor ineffective      29
ineffective                             7
very ineffective                        5
Name: count, dtype: int64


In [None]:
# Write police_base_df to "police_base.csv", leaving out the row index
police_base_df.to_csv(path_or_buf='/content/drive/MyDrive/police_base.csv', index=False)

In [None]:
#Reviewing the last dataset for questions: Citizen_full data
# The path is built from data_folder (set in the configuration cell above) instead of
# repeating the absolute Drive path, so moving the data only requires changing one cell.
citizen_full = os.path.join(data_folder, 'Citizen survey', 'csv', 'citizen_full data.csv')

#Creating df for Citizen_full data.
# NOTE: the converted .csv was written with its row index, which loads here as an 'Unnamed: 0' column.
citizen_full_df = pd.read_csv(citizen_full)

In [None]:
#Overview of Citizen_full dataset
citizen_full_df.head()

Unnamed: 0.1,Unnamed: 0,uid,b_visit,b_pol_handling,b_safety,e_urja_knowledge,e_visit,e_pol_handling,e_safety,spw,...,strat_pca,treatment,group,dist_id,implement_quality,training_score,comm_outreach_strength,regular_whd,women_whd,attrited
0,0,10512011,0.0,0.444444,3.666667,No,0.0,0.5,4.0,0.001526,...,-0.004295,Treatment,regular mhd,3000.0,7.0,3.717949,2.15,1.0,0.0,0.0
1,1,10512021,0.0,0.0,3.666667,,,,,0.001526,...,-0.004295,Treatment,regular mhd,3000.0,7.0,3.717949,2.15,1.0,0.0,1.0
2,2,10512031,0.0,-0.111111,2.333333,No,0.0,0.25,2.0,0.001526,...,-0.004295,Treatment,regular mhd,3000.0,7.0,3.717949,2.15,1.0,0.0,0.0
3,3,10512041,0.0,-0.2,3.666667,No,0.0,,3.0,0.001526,...,-0.004295,Treatment,regular mhd,3000.0,7.0,3.717949,2.15,1.0,0.0,0.0
4,4,10512051,0.0,0.0,3.333333,,,,,0.001526,...,-0.004295,Treatment,regular mhd,3000.0,7.0,3.717949,2.15,1.0,0.0,1.0


In [None]:
#Checking the shape of the data
citizen_full_df.shape

(6519, 27)

In [None]:
#Reviewing Citizen_full info and dtypes
citizen_full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6519 entries, 0 to 6518
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              6519 non-null   int64  
 1   uid                     6519 non-null   int64  
 2   b_visit                 6519 non-null   float64
 3   b_pol_handling          6334 non-null   float64
 4   b_safety                6480 non-null   float64
 5   e_urja_knowledge        3294 non-null   object 
 6   e_visit                 3376 non-null   float64
 7   e_pol_handling          3112 non-null   float64
 8   e_safety                3372 non-null   float64
 9   spw                     6519 non-null   float64
 10  e_gender                3376 non-null   object 
 11  hh_id                   6519 non-null   float64
 12  member_gender           6519 non-null   object 
 13  ps_code                 6519 non-null   float64
 14  population              6519 non-null   

In [None]:
#There is a lot of data not needed in this dataset. Dropping the columns that are not needed.
# errors='ignore' keeps this cell re-runnable: citizen_full_df is overwritten with the result,
# so on a second run the columns are already gone and a plain drop would raise KeyError.
citizen_full_df = citizen_full_df.drop(columns=['b_safety', 'e_urja_knowledge', 'spw', 'hh_id'], errors='ignore')


In [None]:
#Checking to see if columns were dropped
citizen_full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6519 entries, 0 to 6518
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              6519 non-null   int64  
 1   uid                     6519 non-null   int64  
 2   b_visit                 6519 non-null   float64
 3   b_pol_handling          6334 non-null   float64
 4   e_visit                 3376 non-null   float64
 5   e_pol_handling          3112 non-null   float64
 6   e_safety                3372 non-null   float64
 7   e_gender                3376 non-null   object 
 8   member_gender           6519 non-null   object 
 9   ps_code                 6519 non-null   float64
 10  population              6519 non-null   int64  
 11  urban                   6519 non-null   object 
 12  dist_urban              6519 non-null   float64
 13  strat_pca               6519 non-null   float64
 14  treatment               6519 non-null   

In [None]:
#Using .describe to view citizen_full_df
citizen_full_df.describe().T.round(2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,6519.0,3259.0,1882.02,0.0,1629.5,3259.0,4888.5,6518.0
uid,6519.0,38617732.64,20188911.78,10512011.0,21562126.0,40552300.0,50632461.0,80602481.0
b_visit,6519.0,0.07,0.26,0.0,0.0,0.0,0.0,1.0
b_pol_handling,6334.0,0.33,0.48,-1.0,0.0,0.33,0.75,1.0
e_visit,3376.0,0.11,0.32,0.0,0.0,0.0,0.0,1.0
e_pol_handling,3112.0,0.54,0.47,-1.0,0.2,0.67,1.0,1.0
e_safety,3372.0,3.19,0.49,1.0,3.0,3.33,3.5,4.0
ps_code,6519.0,5963.58,3137.8,1001.0,4048.0,5092.0,8136.0,12180.0
population,6519.0,130475.79,86245.59,23565.0,77000.0,111367.0,151916.0,545000.0
dist_urban,6519.0,8.75,4.92,1.0,5.0,8.0,12.0,18.0


In [None]:
#Checking variables from the dataset that I will be using: b_visit, e_visit, b_pol_handling, e_pol_handling, member_gender
b_visit_counts = citizen_full_df['b_visit'].value_counts()
print(b_visit_counts)

e_visit_counts = citizen_full_df['e_visit'].value_counts()
print(e_visit_counts)

# Round the scores BEFORE counting. Calling .round(2) after value_counts() only rounds the
# integer counts, which leaves the unrounded scores as the labels of the table.
b_pol_handling_counts = citizen_full_df['b_pol_handling'].round(2).value_counts()
print(b_pol_handling_counts)

e_pol_handling_counts = citizen_full_df['e_pol_handling'].round(2).value_counts()
print(e_pol_handling_counts)

member_gender_counts = citizen_full_df['member_gender'].value_counts()
print(member_gender_counts)

b_visit
0.0    6039
1.0     480
Name: count, dtype: int64
e_visit
0.0    2999
1.0     377
Name: count, dtype: int64
b_pol_handling
 0.000000    938
 1.000000    901
 0.500000    329
 0.333333    314
 0.666667    272
 0.888889    171
 0.250000    169
 0.444444    162
 0.555556    160
 0.111111    139
 0.777778    134
 0.222222    132
 0.125000    120
-1.000000    117
 0.750000    116
 0.375000    114
 0.285714    103
-0.333333     99
 0.428571     99
 0.166667     98
 0.571429     91
-0.250000     88
 0.625000     88
 0.142857     80
 0.200000     78
 0.714286     78
 0.600000     76
 0.857143     76
 0.400000     69
 0.833333     68
 0.800000     67
-0.111111     66
-0.500000     62
-0.125000     60
 0.875000     60
-0.222222     57
-0.142857     56
-0.666667     46
-0.444444     37
-0.166667     37
-0.285714     34
-0.375000     33
-0.750000     27
-0.428571     26
-0.200000     24
-0.555556     23
-0.400000     21
-0.777778     18
-0.571429     17
-0.625000     14
-0.600000     14
-0

In [None]:
# Write citizen_full_df to "citizen_full.csv", leaving out the row index
citizen_full_df.to_csv(path_or_buf='/content/drive/MyDrive/citizen_full.csv', index=False)