In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
try:
    # Attempt to load the CSV file with ISO-8859-1 encoding
    df = pd.read_csv('finaldata.csv', encoding='ISO-8859-1')
    print(df.head())  # Display the first few rows to check the content
except Exception as e:
    print(f"An error occurred: {e}")


   No   _address                                              _body  \
0   1  AX-FEDBNK  Rs 40.00 debited from your A/c using UPI on 01...   
1   2  AX-FEDBNK  Rs 50.00 debited from your A/c using UPI on 01...   
2   3  AD-FEDBNK  Rs 75.00 debited from your A/c using UPI on 01...   
3   4  AD-FEDBNK  Rs 200.00 debited from your A/c using UPI on 0...   
4   5  AX-FEDBNK  Rs 255.00 debited from your A/c using UPI on 0...   

               date  
0  01-08-2023 08:51  
1  01-08-2023 11:56  
2  01-08-2023 12:05  
3  01-08-2023 17:09  
4  02-08-2023 16:25  


In [3]:
import re
# Define a function to extract amount and action from the message
def extract_details(text):
    # Regex to find the amount pattern and the action "debited"
    amount_match = re.search(r'Rs (\d+\.?\d*)', text)
    action_match = re.search(r'(debited|credited)', text, re.IGNORECASE)

    amount = amount_match.group(1) if amount_match else None
    action = action_match.group(1).lower() if action_match else None
    return pd.Series([amount, action])

# Apply the function to the '_body' column and create new columns
df[['Amount', 'Action']] = df['_body'].apply(extract_details)

# Show the modified DataFrame
print(df[['No', '_address', '_body', 'date', 'Amount', 'Action']].head())


   No   _address                                              _body  \
0   1  AX-FEDBNK  Rs 40.00 debited from your A/c using UPI on 01...   
1   2  AX-FEDBNK  Rs 50.00 debited from your A/c using UPI on 01...   
2   3  AD-FEDBNK  Rs 75.00 debited from your A/c using UPI on 01...   
3   4  AD-FEDBNK  Rs 200.00 debited from your A/c using UPI on 0...   
4   5  AX-FEDBNK  Rs 255.00 debited from your A/c using UPI on 0...   

               date  Amount   Action  
0  01-08-2023 08:51   40.00  debited  
1  01-08-2023 11:56   50.00  debited  
2  01-08-2023 12:05   75.00  debited  
3  01-08-2023 17:09  200.00  debited  
4  02-08-2023 16:25  255.00  debited  


In [4]:
# Create a new column 'Transaction_Type' based on the 'Action' column
df['Transaction_Type'] = df['Action'].apply(lambda x: 1 if x == 'credited' else 0)

# Show the modified DataFrame with the new binary column
print(df[['No', '_address', '_body', 'date', 'Amount', 'Action', 'Transaction_Type']].head())


   No   _address                                              _body  \
0   1  AX-FEDBNK  Rs 40.00 debited from your A/c using UPI on 01...   
1   2  AX-FEDBNK  Rs 50.00 debited from your A/c using UPI on 01...   
2   3  AD-FEDBNK  Rs 75.00 debited from your A/c using UPI on 01...   
3   4  AD-FEDBNK  Rs 200.00 debited from your A/c using UPI on 0...   
4   5  AX-FEDBNK  Rs 255.00 debited from your A/c using UPI on 0...   

               date  Amount   Action  Transaction_Type  
0  01-08-2023 08:51   40.00  debited                 0  
1  01-08-2023 11:56   50.00  debited                 0  
2  01-08-2023 12:05   75.00  debited                 0  
3  01-08-2023 17:09  200.00  debited                 0  
4  02-08-2023 16:25  255.00  debited                 0  


In [5]:
# Check for missing values in each column
print(df.isnull().sum())

# Handling missing values
# For numerical columns, fill missing values with the median or meandf['Amount'] = pd.to_numeric(df['Amount'], errors='coerce')
df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce')
df['Amount'] = df['Amount'].fillna(df['Amount'].median())

# For categorical columns, you might consider filling missing values with the mode or a placeholder like 'Unknown'
df['Action'] = df['Action'].fillna(df['Action'].mode()[0])

# If 'Action' has very few missing values, consider dropping those rows
df.dropna(subset=['Action'], inplace=True)


No                    0
_address              0
_body                 0
date                  0
Amount              594
Action              234
Transaction_Type      0
dtype: int64


In [6]:
# Assuming 'Action' is the only categorical variable that needs encoding
df['Action_Code'] = df['Action'].astype('category').cat.codes

In [7]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Normalization
scaler = MinMaxScaler()
df['Amount_Normalized'] = scaler.fit_transform(df[['Amount']])

# Standardization
scaler = StandardScaler()
df['Amount_Standardized'] = scaler.fit_transform(df[['Amount']])

In [8]:
# Review the DataFrame
print(df.head())

# Export the cleaned and preprocessed DataFrame
df.to_csv('finaldata_cleaned.csv', index=False)

   No   _address                                              _body  \
0   1  AX-FEDBNK  Rs 40.00 debited from your A/c using UPI on 01...   
1   2  AX-FEDBNK  Rs 50.00 debited from your A/c using UPI on 01...   
2   3  AD-FEDBNK  Rs 75.00 debited from your A/c using UPI on 01...   
3   4  AD-FEDBNK  Rs 200.00 debited from your A/c using UPI on 0...   
4   5  AX-FEDBNK  Rs 255.00 debited from your A/c using UPI on 0...   

               date  Amount   Action  Transaction_Type  Action_Code  \
0  01-08-2023 08:51    40.0  debited                 0            1   
1  01-08-2023 11:56    50.0  debited                 0            1   
2  01-08-2023 12:05    75.0  debited                 0            1   
3  01-08-2023 17:09   200.0  debited                 0            1   
4  02-08-2023 16:25   255.0  debited                 0            1   

   Amount_Normalized  Amount_Standardized  
0           0.001444            -0.163380  
1           0.001815            -0.155692  
2           0.

In [9]:
import pandas as pd

# Load your cleaned data
df = pd.read_csv('finaldata_cleaned.csv', encoding='ISO-8859-1')

# Convert 'date' column to datetime format
df['date'] = pd.to_datetime(df['date'], errors='coerce')  # Adjust format if necessary

# Check the conversion
print(df['date'].head())


0   2023-01-08 08:51:00
1   2023-01-08 11:56:00
2   2023-01-08 12:05:00
3   2023-01-08 17:09:00
4   2023-02-08 16:25:00
Name: date, dtype: datetime64[ns]


In [10]:
# Extract year and month from the date for grouping data later
df['Year'] = df['date'].dt.year
df['Month'] = df['date'].dt.month

In [11]:
# Group by year and month, then summarize spending
monthly_spend = df.groupby(['Year', 'Month'])['Amount'].agg(['sum', 'mean']).reset_index()
monthly_spend.columns = ['Year', 'Month', 'Total_Spend', 'Average_Spend']

print(monthly_spend.head())


     Year  Month  Total_Spend  Average_Spend
0  2023.0    1.0      1927.95     148.303846
1  2023.0    2.0       320.00      80.000000
2  2023.0    3.0      1574.00     131.166667
3  2023.0    4.0       262.00      52.400000
4  2023.0    5.0        45.00      45.000000


In [12]:
# Overall average transaction size
average_transaction_size = df['Amount'].mean()
print(f"Average Transaction Size: {average_transaction_size}")


Average Transaction Size: 252.51754468485422


In [13]:
# Count number of transactions per category
transactions_per_category = df['Action'].value_counts().reset_index()
transactions_per_category.columns = ['Action', 'Transaction_Count']

print(transactions_per_category)


     Action  Transaction_Count
0   debited                938
1  credited                125


In [14]:
# Merging monthly data back to the main DataFrame (if necessary)
df = df.merge(monthly_spend, on=['Year', 'Month'], how='left')

# Save the enhanced DataFrame
df.to_csv('finaldata_enhanced.csv', index=False)


In [15]:
# Fill missing values with the mean or median
df['Amount'].fillna(df['Amount'].median(), inplace=True)

# Remove duplicates
df.drop_duplicates(inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Amount'].fillna(df['Amount'].median(), inplace=True)


In [16]:
def categorize_by_amount(amount):
    if amount < 50:
        return 'Miscellaneous'
    elif 100 <= amount < 500:
        return 'Food/Fuel/Shopping'
    elif amount >= 500:
        return 'Rent/High-Value Purchases'
    else:
        return 'Other'

# Ensure the 'Amount' column is numeric
df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce')

# Apply the function to create a new column
df['Amount_Category'] = df['Amount'].apply(categorize_by_amount)

In [17]:
# Ensure the 'Amount' column is numeric
df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce')

# Apply the function to create a new column
df['Amount_Category'] = df['Amount'].apply(categorize_by_amount)

# Check the new categorization
print(df[['Amount', 'Amount_Category']].head())

   Amount     Amount_Category
0    40.0       Miscellaneous
1    50.0               Other
2    75.0               Other
3   200.0  Food/Fuel/Shopping
4   255.0  Food/Fuel/Shopping


In [18]:
# Count the number of transactions per category
category_counts = df['Amount_Category'].value_counts()
print(category_counts)

# Mean spending per category
mean_spending = df.groupby('Amount_Category')['Amount'].mean()
print(mean_spending)


Amount_Category
Other                        676
Miscellaneous                185
Food/Fuel/Shopping           134
Rent/High-Value Purchases     68
Name: count, dtype: int64
Amount_Category
Food/Fuel/Shopping            198.090000
Miscellaneous                  24.605135
Other                          69.656805
Rent/High-Value Purchases    2797.678529
Name: Amount, dtype: float64


In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
data = pd.read_csv('finaldata_enhanced.csv')

# Optional: Handle missing values if not previously handled
data.dropna(subset=['Year', 'Month', 'Total_Spend', 'Average_Spend', 'Amount_Normalized'], inplace=True)

# Select features and target variable
X = data[['Year', 'Month', 'Amount_Normalized']]  # Features
y = data['Total_Spend']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model
r_squared = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# Print the evaluation metrics
print(f'R-squared: {r_squared:.2f}')
print(f'Mean Squared Error: {mse:.2f}')


R-squared: 0.20
Mean Squared Error: 6079770.51


In [20]:
import pandas as pd
import numpy as np

# Load the dataset
data = pd.read_csv('finaldata_enhanced.csv')

# Check for any NaN values in the dataset
print("Initial NaN counts in each column:")
print(data.isna().sum())

# Check for any infinite values in the dataset
print("Initial Inf counts in each column:")
print(data.replace([np.inf, -np.inf], np.nan).isna().sum() - data.isna().sum())

# Replace inf/-inf with NaN and remove any rows with NaN values
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.dropna(inplace=True)

# Optionally, save the cleaned data
data.to_csv('cleaned_data.csv', index=False)

Initial NaN counts in each column:
No                       0
_address                 0
_body                    0
date                   831
Amount                   0
Action                   0
Transaction_Type         0
Action_Code              0
Amount_Normalized        0
Amount_Standardized      0
Year                   831
Month                  831
Total_Spend            831
Average_Spend          831
dtype: int64
Initial Inf counts in each column:
No                     0
_address               0
_body                  0
date                   0
Amount                 0
Action                 0
Transaction_Type       0
Action_Code            0
Amount_Normalized      0
Amount_Standardized    0
Year                   0
Month                  0
Total_Spend            0
Average_Spend          0
dtype: int64


In [21]:
from sklearn.preprocessing import MinMaxScaler

# Prepare features and target
X = data[['Year', 'Month', 'Amount_Normalized']]
y = data['Total_Spend']

# Apply MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)


In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import HeNormal

# Define the neural network
model = Sequential([
    Dense(32, kernel_initializer=HeNormal(), input_shape=(X_scaled.shape[1],)),
    LeakyReLU(alpha=0.01),
    Dense(32, kernel_initializer=HeNormal()),
    LeakyReLU(alpha=0.01),
    Dense(1)  # Output layer with one unit for regression
])

# Compile the model with a conservative learning rate
model.compile(optimizer=Adam(learning_rate=0.0001), loss='mean_squared_error')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [23]:
# Apply log scaling to the target variable to reduce the range of values
y_log = np.log1p(y)  # log1p is used to handle zero values smoothly


In [24]:
# Simpler model
model = Sequential([
    Dense(16, kernel_initializer=HeNormal(), input_shape=(X_train.shape[1],)),
    LeakyReLU(alpha=0.01),
    Dense(1)
])

# More complex model
model = Sequential([
    Dense(64, kernel_initializer=HeNormal(), input_shape=(X_train.shape[1],)),
    LeakyReLU(alpha=0.01),
    Dense(64, kernel_initializer=HeNormal()),
    LeakyReLU(alpha=0.01),
    Dense(1)
])


In [25]:
from tensorflow.keras.optimizers import Adam, SGD

# Using a different optimizer with a lower learning rate
model.compile(optimizer=Adam(learning_rate=0.00001), loss='mean_squared_error')

# Or using SGD which might be more stable in some cases
model.compile(optimizer=SGD(learning_rate=0.01, momentum=0.9), loss='mean_squared_error')


In [26]:
import pandas as pd

# Load the dataset
df = pd.read_csv('finaldata_enhanced.csv')

# Check the first few rows of the dataset
print(df.head())

# General information and statistics
print(df.info())
print(df.describe())


   No   _address                                              _body  \
0   1  AX-FEDBNK  Rs 40.00 debited from your A/c using UPI on 01...   
1   2  AX-FEDBNK  Rs 50.00 debited from your A/c using UPI on 01...   
2   3  AD-FEDBNK  Rs 75.00 debited from your A/c using UPI on 01...   
3   4  AD-FEDBNK  Rs 200.00 debited from your A/c using UPI on 0...   
4   5  AX-FEDBNK  Rs 255.00 debited from your A/c using UPI on 0...   

                  date  Amount   Action  Transaction_Type  Action_Code  \
0  2023-01-08 08:51:00    40.0  debited                 0            1   
1  2023-01-08 11:56:00    50.0  debited                 0            1   
2  2023-01-08 12:05:00    75.0  debited                 0            1   
3  2023-01-08 17:09:00   200.0  debited                 0            1   
4  2023-02-08 16:25:00   255.0  debited                 0            1   

   Amount_Normalized  Amount_Standardized    Year  Month  Total_Spend  \
0           0.001444            -0.163380  2023.0    1.

In [27]:
import pandas as pd

# Load your data
df = pd.read_csv('finaldata_enhanced.csv')

# Function to categorize spending based on amount
def categorize_spending(amount):
    if amount < 50:
        return 'Miscellaneous'
    elif 50 <= amount < 500:
        return 'Food/Fuel/Shopping'
    elif amount >= 500:
        return 'Large Expense/Rent'
    else:
        return 'Other'

# Apply the function to create a new category column
df['Category'] = df['Amount'].apply(categorize_spending)


In [28]:
# Save the updated DataFrame
df.to_csv('upfinaldata_enhanced.csv', index=False)


In [29]:
from flask import Flask


In [30]:
app = Flask(__name__)


In [31]:
@app.route('/')
def home():
    return "Hello, World!"


In [32]:
if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with stat


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
