In [28]:
import re
import os
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
import pmdarima as pm

# Fit a non-seasonal auto-ARIMA model to the event times found in every
# "<egoid>_<alterid>.csv" file and collect the selected (p, d, q) order plus
# the fit statistics into a single CSV.

# Specify the directory containing the CSV files
data_directory = 'telcodataBrandNEW/'

# Column layout of the output CSV
RESULT_COLUMNS = ['egoid', 'alterid', 'p', 'd', 'q', 'log likelihood', 'AIC']

# Position in the sorted file list at which this run starts; entries before
# it are not processed.
START_INDEX = 18

# Files with fewer timestamp rows than this are skipped.
# NOTE(review): the rationale for 41 is not recorded in this notebook.
MIN_EVENTS = 41

SECONDS_PER_DAY = 3600 * 24

# Process files in batches of 500 (the batch only drives the progress message)
batch_size = 500

# os.listdir() returns entries in arbitrary order; sort so that START_INDEX
# refers to the same files on every run.
file_list = sorted(os.listdir(data_directory))

# Rows are accumulated in a plain list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
result_rows = []

try:
    for i in range(START_INDEX, len(file_list), batch_size):
        batch_files = file_list[i:i + batch_size]
        print(f'Processing batch: {i} batch')
        for filename in batch_files:
            if not filename.endswith(".csv"):
                continue

            # Extract egoid and alterid from the filename
            id_parts = filename[:-len('.csv')].split('_')
            if len(id_parts) != 2:
                # A stray file should not abort the whole run.
                print(f'UNEXPECTED FILENAME, SKIPPING {filename}')
                continue
            egoid, alterid = id_parts
            print(f'Processing file {filename}')

            # Load the data
            df_data = pd.read_csv(os.path.join(data_directory, filename))
            timestamps = pd.to_datetime(
                df_data['timestamp'], format="%Y-%m-%d %H:%M:%S"
            ).sort_values()

            if len(timestamps) < MIN_EVENTS:
                print(f'LENGTH OUT OF BOUNDS, SKIPPING {filename}')
                continue

            # Convert timestamps to event times: days elapsed since the
            # earliest event in the file, in chronological order.
            elapsed_days = (timestamps - timestamps.min()).dt.total_seconds() / SECONDS_PER_DAY
            time_series = pd.Series(
                elapsed_days.to_numpy(),
                index=pd.DatetimeIndex(timestamps, name='timestamp'),
                name='timeDiff',
            )

            # Fit ARIMA model
            model_fit = pm.auto_arima(time_series, suppress_warnings=True, seasonal=False)
            p, d, q = model_fit.get_params()['order']
            print(f'p: {p}, d: {d}, q: {q}')

            # Read the statistics from the fitted model instead of scraping
            # the rendered summary text: the summary is rounded for display
            # and a regex miss previously stored a fake 0.0.
            log_likelihood_value = float(model_fit.arima_res_.llf)
            aic_value = float(model_fit.aic())
            print("Log-Likelihood:", log_likelihood_value)
            print("aic:", aic_value)

            result_rows.append({
                'egoid': egoid,
                'alterid': alterid,
                'p': p,
                'd': d,
                'q': q,
                'log likelihood': log_likelihood_value,
                'AIC': aic_value,
            })
except BaseException:
    # Covers KeyboardInterrupt as well as genuine errors: keep whatever has
    # been fitted so far, then let the original exception propagate.
    print("Process interrupted. Saving current results...")
    result_df = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)
    result_df.to_csv('ARIMAResultFits_interrupted_11thDec.csv', index=False)
    print("Results saved.")
    raise

# Save the result DataFrame to a CSV file
result_df = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)
result_df.to_csv('ARIMAResultFits_11thDec.csv', index=False)


Processing batch: 18 batch
Processing file 11002_298357.csv
p: 3, d: 2, q: 3
Log-Likelihood: -2409.77
aic: 4833.541
Processing file 11002_334783.csv
p: 0, d: 2, q: 1
Log-Likelihood: -2650.161
aic: 5304.322
Processing file 11002_48676.csv
p: 0, d: 1, q: 0
Log-Likelihood: -6064.468
aic: 12132.936
Processing file 11002_503852.csv
p: 5, d: 2, q: 0
Log-Likelihood: -5068.088
aic: 10148.177
Processing file 11002_551644.csv
p: 1, d: 2, q: 4
Log-Likelihood: -4941.482
aic: 9894.965
Processing file 11002_559519.csv
p: 0, d: 1, q: 0
Log-Likelihood: -1197.852
aic: 2399.705
Processing file 11002_79395.csv
p: 0, d: 2, q: 2
Log-Likelihood: -1796.173
aic: 3598.346
Processing file 11002_796159.csv
p: 0, d: 1, q: 0
Log-Likelihood: -1096.087
aic: 2196.175
Processing file 11002_823378.csv
p: 0, d: 2, q: 1
Log-Likelihood: -2290.117
aic: 4584.234
Processing file 11002_858353.csv
p: 5, d: 2, q: 3
Log-Likelihood: -3240.105
aic: 6500.21
Processing file 11148_140926.csv
p: 0, d: 2, q: 1
Log-Likelihood: -3097.164

KeyboardInterrupt: 

In [None]:
import re
import os
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
import pmdarima as pm

# Fit a non-seasonal auto-ARIMA model to the event times found in every
# per-ego CSV file and collect the selected (p, d, q) order plus the fit
# statistics into a single CSV.

# Specify the directory containing the CSV files
data_directory = 'telcodataOrganisedWITHEGOIDSONLY/'

# Column layout of the output CSV. 'alterid' is never filled in by this cell;
# it is kept (empty) so the file has the same columns as before.
RESULT_COLUMNS = ['egoid', 'alterid', 'p', 'd', 'q', 'log likelihood', 'AIC']

# Files with fewer timestamp rows than this are skipped.
# NOTE(review): the rationale for 41 is not recorded in this notebook.
MIN_EVENTS = 41

SECONDS_PER_DAY = 3600 * 24

# Process files in batches of 500 (the batch only drives the progress message)
batch_size = 500

# os.listdir() returns entries in arbitrary order; sort so that runs are
# reproducible and an interrupted run can be compared against a later one.
file_list = sorted(os.listdir(data_directory))

# Rows are accumulated in a plain list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
result_rows = []

try:
    for i in range(0, len(file_list), batch_size):
        batch_files = file_list[i:i + batch_size]
        print(f'Processing batch: {i} batch')
        for filename in batch_files:
            if not filename.endswith(".csv"):
                continue

            # Extract the egoid: the first run of digits in the filename
            digit_runs = re.findall(r"\d+", filename)
            if not digit_runs:
                # A stray file should not abort the whole run.
                print(f'NO EGOID IN FILENAME, SKIPPING {filename}')
                continue
            egoid = digit_runs[0]
            print(f'Processing file {egoid}.csv')

            # Load the data
            df_data = pd.read_csv(os.path.join(data_directory, filename))
            timestamps = pd.to_datetime(
                df_data['timestamp'], format="%Y-%m-%d %H:%M:%S"
            ).sort_values()

            if len(timestamps) < MIN_EVENTS:
                print(f'LENGTH OUT OF BOUNDS, SKIPPING {filename}')
                continue

            # Convert timestamps to event times: days elapsed since the
            # earliest event in the file, in chronological order.
            elapsed_days = (timestamps - timestamps.min()).dt.total_seconds() / SECONDS_PER_DAY
            time_series = pd.Series(
                elapsed_days.to_numpy(),
                index=pd.DatetimeIndex(timestamps, name='timestamp'),
                name='timeDiff',
            )

            # Fit ARIMA model
            model_fit = pm.auto_arima(time_series, suppress_warnings=True, seasonal=False)
            p, d, q = model_fit.get_params()['order']
            print(f'p: {p}, d: {d}, q: {q}')

            # Read the statistics from the fitted model instead of scraping
            # the rendered summary text: the summary is rounded for display
            # and a regex miss previously stored a fake 0.0.
            log_likelihood_value = float(model_fit.arima_res_.llf)
            aic_value = float(model_fit.aic())
            print("Log-Likelihood:", log_likelihood_value)
            print("aic:", aic_value)

            result_rows.append({
                'egoid': egoid,
                'p': p,
                'd': d,
                'q': q,
                'log likelihood': log_likelihood_value,
                'AIC': aic_value,
            })
except BaseException:
    # Covers KeyboardInterrupt as well as genuine errors: keep whatever has
    # been fitted so far, then let the original exception propagate.
    print("Process interrupted. Saving current results...")
    result_df = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)
    result_df.to_csv('ARIMAResultFits_interrupted_Egoid_ONLY.csv', index=False)
    print("Results saved.")
    raise

# Save the result DataFrame to a CSV file
result_df = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)
result_df.to_csv('ARIMAResultFits_Egoid_ONLY.csv', index=False)


Processing batch: 0 batch
Processing file 10153.csv
p: 4, d: 2, q: 2
Log-Likelihood: -5167.445
aic: 10348.89
Processing file 10237.csv
p: 5, d: 2, q: 1
Log-Likelihood: 417.547
aic: -819.094
Processing file 10469.csv
p: 1, d: 2, q: 1
Log-Likelihood: -4422.169
aic: 8850.337
Processing file 10732.csv
p: 5, d: 2, q: 1
Log-Likelihood: 13735.814
aic: -27457.627
Processing file 11002.csv
