In [9]:
import pandas as pd
import os

# Directory containing the per-security CSV files
directory_path = "../../CSE_DATA/SHARE_CHANGE/"

# os.listdir returns entries in arbitrary order; sorting makes the row index
# of the concatenated frame reproducible from run to run.
filenames = sorted(
    os.path.join(directory_path, file)
    for file in os.listdir(directory_path)
    if file.endswith('.csv')
)

# Empty placeholder frame. Nothing in the visible cells reads it; it is kept
# only because cells outside this view may still reference the name.
data = pd.DataFrame()

# Read each CSV and tag its rows with a label derived from the file name
dataframes = []
for file in filenames:
    df = pd.read_csv(file)
    # splitext removes only the trailing extension, whereas
    # str.replace(".csv", "") would also strip ".csv" from the middle of a name
    label = os.path.splitext(os.path.basename(file))[0]
    df['Label'] = label
    dataframes.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

# Filter out data associated with the label 'SWAD.N0000'.
# .copy() detaches the filtered frame so the column assignments below write to
# a real frame instead of a slice (avoids SettingWithCopyWarning).
combined_df = combined_df[combined_df['Label'] != 'SWAD.N0000'].copy()

# Convert 'Trade Date' from MM/DD/YY to DD/MM/YYYY format
combined_df['Trade Date'] = pd.to_datetime(combined_df['Trade Date'], format='%m/%d/%y').dt.strftime('%d/%m/%Y')

# Split 'Trade Date' into zero-padded string 'Day', 'Month', and 'Year' columns
combined_df[['Day', 'Month', 'Year']] = combined_df['Trade Date'].str.split('/', expand=True)

# Select the columns 'Label', 'Day', 'Month', 'Year', and 'Close (Rs.)'.
# .copy() so later cells can add columns to result_df without touching combined_df.
selected_columns = ['Label', 'Day', 'Month', 'Year', 'Close (Rs.)']
result_df = combined_df[selected_columns].copy()

# Sort chronologically (Year, Month, Day), then by 'Label' within a day.
# The date parts are zero-padded strings, so string ordering matches date ordering.
result_df = result_df.sort_values(by=['Year', 'Month', 'Day', 'Label'])

print(result_df)

             Label Day Month  Year  Close (Rs.)
216103   JKH.N0000  17    08  1986          0.0
132130   SUN.N0000  30    03  1990         12.5
74593   GREG.N0000  02    04  1990          0.5
401097  SELI.N0000  10    01  1991        550.0
463724  SHAL.N0000  08    02  1991        575.0
...            ...  ..   ...   ...          ...
166672  VPEL.N0000  13    10  2023          6.8
336467  WAPO.N0000  13    10  2023         31.3
446639  WATA.N0000  13    10  2023         74.0
69288   WIND.N0000  13    10  2023         19.0
132589  YORK.N0000  13    10  2023        168.5

[630051 rows x 5 columns]


In [11]:
import pandas as pd

# Load the election table from the CSV that sits next to this notebook
election_df = pd.read_csv(filepath_or_buffer='./ELECTION_DATA_SRI_LANKA.csv')

# Helper that breaks a dot-separated date string into its numeric parts
def split_date(date_str):
    """Split a 'D.M.Y' string on '.' and return ``(day, month, year)`` as ints.

    Raises ValueError when a piece is not an integer or when the string does
    not contain exactly three dot-separated pieces.
    """
    pieces = [int(piece) for piece in date_str.split('.')]
    day, month, year = pieces
    return (day, month, year)

# Expand "Date From" and "Date To" into integer day / month / year columns.
# The tuples returned by split_date are turned into a frame in one step;
# .apply(pd.Series) would instead construct a separate Series for every row.
for prefix, source_column in (('Start', 'Date From'), ('End', 'Date To')):
    date_parts = pd.DataFrame(
        election_df[source_column].map(split_date).tolist(),
        index=election_df.index,
    )
    election_df[[prefix + ' Day', prefix + ' Month', prefix + ' Year']] = date_parts

# Drop the original date columns
election_df = election_df.drop(['Date From', 'Date To'], axis=1)

# The former rename step mapped every column name to itself, so it changed
# nothing and has been removed.

# Print the resulting DataFrame
print(election_df)

    Election Number                                      Election Name  \
0                 1                 General Election - 1 State Council   
1                 2                 General Election - 2 State Council   
2                 3  General Election - House of Representatives (1...   
3                 4  General Election - House of Representatives  (...   
4                 5  General Election - House of Representatives  (...   
..              ...                                                ...   
75               75  General Election - 8 Parliament of the D.S.R. ...   
76               76                          Local Government Election   
77               77                          Elpitiya Pradeshiya Sabha   
78               78                          Presidential Election (8)   
79               79  General Election - 9 Parliament of the D.S.R. ...   

    Start Day  Start Month  Start Year  End Day  End Month  End Year  
0          13            6        1931  

In [19]:
# Read the election date ranges ('Date From' / 'Date To' as DD.MM.YYYY strings)
date_ranges_df = pd.read_csv('./ELECTION_DATA_SRI_LANKA.csv')

# Parse the trade date of every row once. It does not depend on the date range,
# so recomputing it twice per loop iteration was wasted work.
trade_dates = pd.to_datetime(
    result_df['Year'] + result_df['Month'] + result_df['Day'], format='%Y%m%d'
)

# Collect the rows falling inside each range (bounds inclusive). Rows covered
# by several overlapping ranges appear once per range.
matches = []
for index, row in date_ranges_df.iterrows():
    date_from = pd.to_datetime(row['Date From'], format='%d.%m.%Y')
    date_to = pd.to_datetime(row['Date To'], format='%d.%m.%Y')
    matches.append(result_df[(trade_dates >= date_from) & (trade_dates <= date_to)])

# Concatenate once instead of growing a frame inside the loop; fall back to an
# empty frame with the expected columns when there are no date ranges at all.
if matches:
    filtered_results = pd.concat(matches)
else:
    filtered_results = pd.DataFrame(columns=selected_columns)

# Sort chronologically (Year, Month, Day), then by 'Label' within a day
filtered_results = filtered_results.sort_values(by=['Year', 'Month', 'Day', 'Label'])

print(filtered_results)

             Label Day Month  Year  Close (Rs.)
9591     ACL.N0000  17    05  1993        72.50
9592     BHR.N0000  17    05  1993        39.75
9593    BOGA.N0000  17    05  1993         5.75
9594     CCS.N0000  17    05  1993        18.00
9595    CFIN.N0000  17    05  1993         0.00
...            ...  ..   ...   ...          ...
531472  VFIN.N0000  06    08  2020        58.60
531473   VLL.N0000  06    08  2020         4.90
531474  VONE.N0000  06    08  2020        15.00
531475  VPEL.N0000  06    08  2020         6.00
531476  WATA.N0000  06    08  2020        30.00

[3165 rows x 5 columns]


In [20]:
# Read the election date ranges ('Date From' / 'Date To' as DD.MM.YYYY strings)
date_ranges_df = pd.read_csv('./ELECTION_DATA_SRI_LANKA.csv')

# Parse the trade date of every row once; it is the same for every date range
trade_dates = pd.to_datetime(
    result_df['Year'] + result_df['Month'] + result_df['Day'], format='%Y%m%d'
)

# Accumulate a mask of rows that fall inside at least one range (bounds inclusive)
in_election = pd.Series(False, index=result_df.index)
for index, row in date_ranges_df.iterrows():
    date_from = pd.to_datetime(row['Date From'], format='%d.%m.%Y')
    date_to = pd.to_datetime(row['Date To'], format='%d.%m.%Y')
    in_election |= (trade_dates >= date_from) & (trade_dates <= date_to)

# Build a new frame with the 0/1 "Election" column rather than writing into
# result_df in place: this avoids SettingWithCopyWarning and gives the same
# result when the cell is re-run.
result_df = result_df.assign(Election=in_election.astype('int64'))

# Sort chronologically (Year, Month, Day), then by 'Label' within a day
result_df = result_df.sort_values(by=['Year', 'Month', 'Day', 'Label'])

print(result_df)

             Label Day Month  Year  Close (Rs.)  Election
0        JKH.N0000  17    08  1986          0.0         0
1        SUN.N0000  30    03  1990         12.5         0
2       GREG.N0000  02    04  1990          0.5         0
3       SELI.N0000  10    01  1991        550.0         0
4       SHAL.N0000  08    02  1991        575.0         0
...            ...  ..   ...   ...          ...       ...
630046  VPEL.N0000  13    10  2023          6.8         0
630047  WAPO.N0000  13    10  2023         31.3         0
630048  WATA.N0000  13    10  2023         74.0         0
630049  WIND.N0000  13    10  2023         19.0         0
630050  YORK.N0000  13    10  2023        168.5         0

[630051 rows x 6 columns]


In [21]:
# Write the flagged price table to disk without the row index
result_df.to_csv(path_or_buf='filtered_data_with_election.csv', index=False)