In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive inside the Colab runtime.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import re
import string
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus so that stopwords.words() can read it.
nltk.download('stopwords')

# English stop words held in a set, used for membership tests during cleaning.
stop_words = {*stopwords.words('english')}

# Custom transformer for text cleaning
class TextCleaner(BaseEstimator, TransformerMixin):
    """Stateless transformer that normalises free-text values.

    Every value is lower-cased, stripped of ASCII punctuation and of digits,
    and filtered of stop words; the surviving tokens are re-joined with
    single spaces.

    Note that punctuation is removed *before* stop words are filtered, so a
    stop word that contains punctuation can only match if its stripped form
    is itself in the stop-word collection.

    Parameters
    ----------
    stop_words : collection of str, optional
        Words to discard. When ``None`` (the default) the module-level
        ``stop_words`` set is used, which is what this class always did
        before the parameter existed.
    """

    # Compiled once when the class is created rather than rebuilt per row.
    _PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")
    _DIGITS_RE = re.compile(r'\d+')

    def __init__(self, stop_words=None):
        # Stored unmodified so BaseEstimator.get_params()/clone() round-trip.
        self.stop_words = stop_words

    def fit(self, X, y=None):
        """Do nothing (there is no state to learn) and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a cleaned version of every value in ``X``.

        Parameters
        ----------
        X : pandas.Series or list-like
            Values to clean. Anything that is not already a Series is
            wrapped in one first.
        y : ignored
            Accepted only for scikit-learn API compatibility.

        Returns
        -------
        pandas.Series
            One cleaned string per input value. Missing values (None, NaN,
            pd.NA) become the empty string; other non-string values are
            converted with ``str()`` before cleaning.
        """
        # getattr keeps instances created without __init__ working.
        custom_stop_words = getattr(self, "stop_words", None)
        if custom_stop_words is None:
            # Fall back to the module-level set (original behaviour).
            active_stop_words = stop_words
        else:
            # A set gives constant-time membership tests whatever was passed.
            active_stop_words = set(custom_stop_words)

        punctuation_re = self._PUNCTUATION_RE
        digits_re = self._DIGITS_RE

        def clean_text(text):
            if not isinstance(text, str):
                # Missing scalars would otherwise fail on .lower().
                if pd.api.types.is_scalar(text) and pd.isna(text):
                    return ""
                text = str(text)
            text = text.lower()  # Lowercase text
            text = punctuation_re.sub("", text)  # Remove punctuation
            text = digits_re.sub("", text)  # Remove digits
            # Remove stop words; split()/join also collapses whitespace runs
            return ' '.join(word for word in text.split() if word not in active_stop_words)

        series = X if isinstance(X, pd.Series) else pd.Series(X, dtype=object)
        return series.apply(clean_text)

# Define the file path
file_path = '.../ed_data.csv'

# Load the data into a pandas DataFrame
df = pd.read_csv(file_path)

# Column groups that are merged into a single text variable each
rfv_columns = ['RFV1', 'RFV2', 'RFV3', 'RFV4', 'RFV5']
cause_columns = ['CAUSE1', 'CAUSE2', 'CAUSE3']

# Replace "Blank" with NA in CAUSE1, CAUSE2, CAUSE3.
# The result is assigned back instead of calling
# df[col].replace(..., inplace=True): that form mutates an intermediate
# object, triggers a pandas warning, and will not update df in pandas 3.0.
df[cause_columns] = df[cause_columns].replace("Blank", pd.NA)

# Merge RFV1-RFV5 into one variable (missing entries are skipped; values are
# stringified so a non-string entry cannot break the join)
df['RFV'] = df[rfv_columns].apply(lambda row: ' '.join(map(str, row.dropna())), axis=1)

# Merge CAUSE1-CAUSE3 into one variable
df['CAUSE'] = df[cause_columns].apply(lambda row: ' '.join(map(str, row.dropna())), axis=1)

# Drop the original RFV1-RFV5 and CAUSE1-CAUSE3 columns after merging
df = df.drop(columns=rfv_columns + cause_columns)

# Initialize the text cleaner
text_cleaner = TextCleaner()

# Clean the 'RFV' and 'CAUSE' columns
df['RFV_clean'] = text_cleaner.transform(df['RFV'])
df['CAUSE_clean'] = text_cleaner.transform(df['CAUSE'])

# Combine the cleaned 'RFV_clean' and 'CAUSE_clean' columns into one column
df['Combined_clean'] = df['RFV_clean'] + ' ' + df['CAUSE_clean']

# Combine the text from the 'Combined_clean' column for word cloud and frequency analysis
combined_text = ' '.join(df['Combined_clean'].dropna())

# Drop the intermediate text columns; 'Combined_clean' stays in df.
# (The original list named 'RFV_clean' twice; each column is now listed once.)
df = df.drop(columns=['RFV', 'CAUSE', 'RFV_clean', 'CAUSE_clean'])

# Generate word frequencies
word_freq = Counter(combined_text.split())

# Exclusive minimum-count cut-offs for the two frequency tables below
HIGH_FREQ_THRESHOLD = 800
LOW_FREQ_THRESHOLD = 200

# Keep only the words that occur more often than each cut-off
filtered_word_freq = {word: freq for word, freq in word_freq.items() if freq > HIGH_FREQ_THRESHOLD}
filtered_word_freq1 = {word: freq for word, freq in word_freq.items() if freq > LOW_FREQ_THRESHOLD}

# Sort the filtered word frequencies by frequency, most frequent first
sorted_word_freq = dict(sorted(filtered_word_freq.items(), key=lambda item: item[1], reverse=True))
sorted_word_freq1 = dict(sorted(filtered_word_freq1.items(), key=lambda item: item[1], reverse=True))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['CAUSE1'].replace("Blank", pd.NA, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['CAUSE2'].replace("Blank", pd.NA, inplace=True)
The behavio

In [None]:
# Destination for the processed DataFrame
output_file_path = '.../cleaned_ed_data.csv'

# Write df as CSV without the index column
df.to_csv(path_or_buf=output_file_path, index=False)