In [74]:
# Optional environment setup: uncomment to install the pandas version this notebook was run with.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# %pip install -U "pandas==2.1.0"

In [75]:
import os 
import pandas as pd 

In [76]:
# show which pandas version is used in this run
print(pd.__version__)

2.1.0


In [41]:
# Root folders of the raw email datasets, given as relative paths
# (they are resolved against the current working directory).
# The short dataset is used for the 6-category version ...
SHORT_DATASET_DIR = "unprocessed_emails/short_emails_dataset"
# ... and the full dataset for the 21- and 122-category versions.
FULL_DATASET_DIR = "unprocessed_emails/full_emails_dataset/NEUEBEISPIELMENGE/"

## Structure of the original dataset

The datasets, both the short and the full version, consist of folders whose names represent the category of the emails they contain, for example RECHNUNGEN (invoices), MAHNUNG (reminder), STORNO (cancellation), etc. Each of these folders can contain .txt emails, subfolders with emails, or both. For example, the folder RECHNUNGEN contains the subfolder ERSTRECHNUNGSANFRAGE, which holds the emails of this subcategory.

```
.
└── short_emails_dataset/ 
    ├── 1000_RECHNUNGEN/
    │   ├── 101_ERSTRECHNUNGSANFRAGE/
    │   │   ├── ABL4.TXT
    │   │   ├── ABL5.TXT 
    │   │   └── ...
    │   ├── 102_RECHNUNGSANFRAGEN_ALLGEM/
    │   │   ├── ABL1.TXT 
    │   │   ├── ABL3.TXT 
    │   │   └── ...
    │   └── ...
    ├── 1400_MAHNUNG/
    │   ├── 140_MAHNUNG/
    │   │   ├── ABL32.TXT 
    │   │   ├── ABL125.TXT 
    │   │   └── ...
    │   ├── ...
    │   └── ABL34.TXT 
    └── 2110_STORNO/
        ├── ABL41.TXT
        ├── ABL79.TXT
        └── ...
```

## Short dataset generation (6 categories)

In [69]:
# Build the short dataset (6 categories) and save it as a .csv file.
# Every email is labelled with the name of its top-level folder (numeric prefix
# removed), including emails that are stored inside a subfolder.

email_data = {
    'Category': [],
    'Content': []
}


# Step 1: collect (category, path) pairs for every .TXT email,
# so that the file-reading logic below exists only once.
email_files = []

for folder in os.listdir(SHORT_DATASET_DIR):

    fold = os.path.join(SHORT_DATASET_DIR, folder)

    # skip anything that is not a category folder (os.listdir would raise on a file)
    if not os.path.isdir(fold):
        continue

    # drop the numeric prefix, e.g. '1000_RECHNUNGEN' -> 'RECHNUNGEN'
    no_num = folder.split('_')[1:]
    clean_folder = '_'.join(no_num)

    for file in os.listdir(fold):

        file_item = os.path.join(fold, file)

        if os.path.isdir(file_item):
            # subfolder: its emails get the category of the top-level folder
            for sub_file in os.listdir(file_item):
                if sub_file.endswith('.TXT'):
                    email_files.append((clean_folder, os.path.join(file_item, sub_file)))

        elif file.endswith('.TXT'):
            # single .txt email stored directly in the category folder
            email_files.append((clean_folder, file_item))


# Step 2: read the emails.
for category, email_path in email_files:

    # Some emails appear to be UTF-8 encoded: decoding those as latin-1 turns
    # 'ü' into 'Ã¼'. Strict UTF-8 decoding fails on latin-1 umlauts, so try
    # UTF-8 first and fall back to latin-1, which accepts every byte sequence.
    try:
        with open(email_path, 'r', encoding='utf-8') as email_file:
            cont = email_file.read()
    except UnicodeDecodeError:
        with open(email_path, 'r', encoding='latin-1') as email_file:
            cont = email_file.read()

    email_data['Category'].append(category)
    email_data['Content'].append(cont)

# convert dict to DataFrame with keys being column names
df = pd.DataFrame(email_data)

# save DataFrame as a .csv file (create the output folder on a fresh checkout)
os.makedirs('csv_datasets', exist_ok=True)
df.to_csv('csv_datasets/short_dataset.csv')

In [70]:
# email categories in the short version of emails dataset
# distinct category labels, in order of first appearance in the DataFrame
df['Category'].unique()

array(['TARIFE', 'RECHNUNGEN', 'MAHNUNGEN', 'STORNO', 'AKTIVIERUNG_SIM',
       'VERTRAEGE_UND_VEREINBARUN'], dtype=object)

Encoding was a big issue at first. Initially, I used utf-8 for reading the emails but it failed to read special German letters. <br>

Special letters:
<ol>
<li> Vowels with diacritics (ä, ö, ü)
<li> ß (Eszett letter)
</ol>

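Eventually, I used latin-1 (ISO 8859-1) to decode the emails. It defines 191 printable characters from the Latin script, including ä, ö, ü and ß, and it maps every possible byte to a character, so reading a file with it never raises a decoding error.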

In [58]:
# the first email in our dataframe
print(df['Content'][0])

An:	KUNDENSERVICE@EPLUS
Kopie:	 
Datum:	06.06.2001 08:46:53
Betreff:	E-Mail Beantwortung

	  



Anrede         : Frau
Name           : Sim Karte Gesperrt Thielsch Kai
Handyrufnummer : 49 178 8820181
Account Nummer :
Handymodell    : 6150
Provider       : E-Plus Service
Email          : kaithielsch@gmx.de
-------------------------------------
Nachricht      : Liebes e plus Team
Ich habe aus versehen meinen Pin und den darauffolgenden "puk" falsch 
eingegeben
Nun ist meine sim karte gesperrt
Wie kann ich einen neuen Pin bekommen damit mein handy wieder 
einsatzbereit ist??

Vielen dank im vorraus
Mit freundlichen grüßen
Kai Thielsch
-------------------------------------
--444043493.991813613145.JavaMail.nsuser@apps3mn1--




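Use the closing line of this email ("Mit freundlichen ...") to check the decoding: the diacritics and the Eszett letter should read "grüßen". If they appear as byte-pair artifacts instead (for example "Ã¼" in place of "ü"), the file is actually UTF-8 encoded and was mis-decoded as latin-1.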


## Full dataset generation (21 categories, no subcategories)

In [42]:
# List every entry directly under the full dataset root, one per line.
# Entries are printed as returned by os.listdir, without filtering,
# so anything that is not a category folder shows up here as well.
for folder in os.listdir(FULL_DATASET_DIR):
    print(folder)

ROLLEN_VORGANG.XLS
2500_R_KUNDEN
4100_UPGRADE_ERSATZ_GUTSCHRIFT
2900_TARIFE
VORGANGSLISTE.XLS
1000_RECHNUNGEN
1400_MAHNUNGEN
4200_NETZ
7000_AKTIONEN
3100_NON_VOICE_DIENSTE
4000_GERAETE_UND_ZUBEHOER
6000_VERTRIEBSPARTNER
8000_PRAEVENTION
2400_KUENDIGUNGEN
2110_STORNO
2700_AKTIVIERUNG_SIM
5000_FREE___EASY
3400_KUNDENBETREUUNG_ONLINE
3300_E_PLUS_ONLINE
2100_VERTRAEGE_UND_VEREINBARUN
2600_TEILNEHMERSTAMMDATEN
EPLUS_EMS.IPR
3200_MEHRWERTDIENSTE
3000_NETZDIENSTE


In [19]:
# Build the full dataset with 21 categories and save it as a .csv file.
# Every email is labelled with the name of its top-level folder (numeric prefix
# removed); emails inside subfolders inherit the category of the parent folder.
email_data = {
    'Category': [],
    'Content': []
}


# Step 1: collect (category, path) pairs for every .TXT email,
# so that the file-reading logic below exists only once.
email_files = []

for folder in os.listdir(FULL_DATASET_DIR):

    fold = os.path.join(FULL_DATASET_DIR, folder)

    # the dataset root also contains entries that are not folders; skip them
    if not os.path.isdir(fold):
        continue

    # drop the numeric prefix, e.g. '1000_RECHNUNGEN' -> 'RECHNUNGEN'
    no_num = folder.split('_')[1:]
    clean_folder = '_'.join(no_num)

    for file in os.listdir(fold):

        file_item = os.path.join(fold, file)

        if os.path.isdir(file_item):
            # subfolder: its emails get the category of the top-level folder
            for sub_file in os.listdir(file_item):
                if sub_file.endswith('.TXT'):
                    email_files.append((clean_folder, os.path.join(file_item, sub_file)))

        elif file.endswith('.TXT'):
            # single .txt email stored directly in the category folder
            email_files.append((clean_folder, file_item))


# Step 2: read the emails.
for category, email_path in email_files:

    # Some emails appear to be UTF-8 encoded: decoding those as latin-1 turns
    # 'ü' into 'Ã¼'. Strict UTF-8 decoding fails on latin-1 umlauts, so try
    # UTF-8 first and fall back to latin-1, which accepts every byte sequence.
    try:
        with open(email_path, 'r', encoding='utf-8') as email_file:
            cont = email_file.read()
    except UnicodeDecodeError:
        with open(email_path, 'r', encoding='latin-1') as email_file:
            cont = email_file.read()

    email_data['Category'].append(category)
    email_data['Content'].append(cont)

# convert dictionary to df
df = pd.DataFrame(email_data)

# save df as .csv file (create the output folder on a fresh checkout)
os.makedirs('csv_datasets', exist_ok=True)
df.to_csv('csv_datasets/full_dataset.csv')

In [20]:
# distinct category labels of the full dataset, in order of first appearance
df['Category'].unique()

array(['R_KUNDEN', 'UPGRADE_ERSATZ_GUTSCHRIFT', 'TARIFE', 'RECHNUNGEN',
       'MAHNUNGEN', 'NETZ', 'AKTIONEN', 'NON_VOICE_DIENSTE',
       'GERAETE_UND_ZUBEHOER', 'VERTRIEBSPARTNER', 'PRAEVENTION',
       'KUENDIGUNGEN', 'STORNO', 'AKTIVIERUNG_SIM', 'FREE___EASY',
       'KUNDENBETREUUNG_ONLINE', 'E_PLUS_ONLINE',
       'VERTRAEGE_UND_VEREINBARUN', 'TEILNEHMERSTAMMDATEN',
       'MEHRWERTDIENSTE', 'NETZDIENSTE'], dtype=object)

In [21]:
# the number of distinct categories in this dataset
df['Category'].nunique()

21

## Full dataset generation (122 categories)

In [78]:
# Build the full dataset with subcategories and save it as a .csv file.
# Emails stored inside a subfolder are labelled with the subfolder name, emails
# stored directly in a top-level folder are labelled with that folder's name
# (numeric prefix removed in both cases).
email_data = {
    'Category': [],
    'Content': []
}


# Step 1: collect (category, path) pairs for every .TXT email,
# so that the file-reading logic below exists only once.
email_files = []

for folder in os.listdir(FULL_DATASET_DIR):

    fold = os.path.join(FULL_DATASET_DIR, folder)

    # the dataset root also contains entries that are not folders; skip them
    if not os.path.isdir(fold):
        continue

    # drop the numeric prefix, e.g. '1000_RECHNUNGEN' -> 'RECHNUNGEN'
    no_num = folder.split('_')[1:]
    clean_folder = '_'.join(no_num)

    for file in os.listdir(fold):

        file_item = os.path.join(fold, file)

        if os.path.isdir(file_item):
            # Subfolder: its own name (without the numeric prefix) is the category.
            # `file` already is that name, so there is no need to split the joined
            # path on '/', which gives a wrong label where the separator is '\'.
            clean_fold = '_'.join(file.split('_')[1:])

            for sub_file in os.listdir(file_item):
                if sub_file.endswith('.TXT'):
                    email_files.append((clean_fold, os.path.join(file_item, sub_file)))

        elif file.endswith('.TXT'):
            # single .txt email stored directly in the top-level category folder
            email_files.append((clean_folder, file_item))


# Step 2: read the emails.
for category, email_path in email_files:

    # Some emails appear to be UTF-8 encoded: decoding those as latin-1 turns
    # 'ü' into 'Ã¼'. Strict UTF-8 decoding fails on latin-1 umlauts, so try
    # UTF-8 first and fall back to latin-1, which accepts every byte sequence.
    try:
        with open(email_path, 'r', encoding='utf-8') as email_file:
            cont = email_file.read()
    except UnicodeDecodeError:
        with open(email_path, 'r', encoding='latin-1') as email_file:
            cont = email_file.read()

    email_data['Category'].append(category)
    email_data['Content'].append(cont)

# convert dict to df
df = pd.DataFrame(email_data)

# save df as .csv file (create the output folder on a fresh checkout)
os.makedirs('csv_datasets', exist_ok=True)
df.to_csv('csv_datasets/full_dataset_with_subcategories.csv')

In [79]:
# the number of categories in this dataset
df['Category'].nunique()

122

## Summary

Now, we have 3 datasets which we will test using different algorithms and models. 

Datasets:
<ol>
<li> Short dataset (6 categories)
<li> Full dataset (21 categories, subcategories excluded)
<li> Full dataset (122 categories, subcategories counted as separate categories)
</ol>