In [None]:
import string
import xml.etree.ElementTree as ET
from collections import Counter

import pandas as pd

In [2]:
# Parse the SMS corpus and grab the root element of the document.
tree = ET.parse('../data/spam-sms.xml')
root = tree.getroot()

# Turn every <Record> element into a {child tag: child text} dict;
# the dict keys become the DataFrame columns.
records = [
    {field.tag: field.text for field in record}
    for record in root.findall('Record')
]

df = pd.DataFrame(records)

print(df)


                                                message
0     Go until jurong point, crazy.. Available only ...
1                         Ok lar... Joking wif u oni...
2     Free entry in 2 a wkly comp to win FA Cup fina...
3     U dun say so early hor... U c already then say...
4     Nah I don't think he goes to usf, he lives aro...
...                                                 ...
5567  This is the 2nd time we have tried 2 contact u...
5568               Will ü b going to esplanade fr home?
5569  Pity, * was in mood for that. So...any other s...
5570  The guy did some bitching but I acted like i'd...
5571                         Rofl. Its true to its name

[5572 rows x 1 columns]


In [3]:
# Characters removed by preprocess_text: ASCII punctuation and decimal digits.
# Built once so that every call is a single pass over the message.
_STRIP_TABLE = str.maketrans('', '', string.punctuation + '1234567890')


def preprocess_text(text):
    """Lowercase ``text`` and delete every ASCII punctuation mark and digit.

    Characters are deleted, not replaced with a space, so words joined only
    by punctuation are fused (``"So...any"`` becomes ``"soany"``).

    Parameters
    ----------
    text : str
        Raw message.

    Returns
    -------
    str
        The cleaned message; whitespace is left untouched.
    """
    return text.lower().translate(_STRIP_TABLE)

# Clean every raw message and store the result in a new column.
df['preprocessed_messages'] = df['message'].map(preprocess_text)

df

Unnamed: 0,message,preprocessed_messages
0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final...
3,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,this is the nd time we have tried contact u u...
5568,Will ü b going to esplanade fr home?,will ü b going to esplanade fr home
5569,"Pity, * was in mood for that. So...any other s...",pity was in mood for that soany other suggest...
5570,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like id ...


<details>
<summary style="font-family: vazir; color: #0099cc; font-size: medium; text-align: right; direction: rtl;">
معنی لغوی <code>lexicographical</code>
</summary>
<p style="direction: rtl; text-align: justify; font-family: vazir; font-size: medium; line-height: 200%;">
<font face="vazir">
ترتیب لغوی یا <code>lexicographical</code> مشابه ترتیب واژه‌ها در دیکشنری است؛ یعنی کلمات یا رشته‌ها حرف‌به‌حرف و براساس ترتیب حروف الفبا مرتب می‌شوند. برای مثال:
<ul style="direction: rtl; text-align: right;">
  <li>«apple» قبل از «banana» می‌آید چون حرف <code>a</code> از حرف <code>b</code> در حروف الفبا جلوتر است.</li>
  <li>اگر دو کلمه اولین و دومین حروف یکسانی داشته باشند، مقایسه به حرف سوم می‌رسد. برای مثال: «bar» قبل از «bat» قرار می‌گیرد چون بعد از مقایسه <code>b</code> و <code>a</code>، حرف سوم <code>r</code> از <code>t</code> جلوتر است.</li>
  <li>«cat» قبل از «dog» می‌آید چون <code>c</code> از <code>d</code> جلوتر است.</li>
</ul>
<p style="direction: rtl; text-align: justify; font-family: vazir; font-size: medium; line-height: 200%;">
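این ترتیب برای حروف بزرگ و کوچک یا زبان‌های مختلف ممکن است متفاوت باشد (برای مثال، پایتون رشته‌ها را براساس کد یونیکد نویسه‌ها مقایسه می‌کند و در نتیجه «Zebra» قبل از «apple» قرار می‌گیرد)، ولی مفهوم کلی یکسان است.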
</p>
</font>
</p>
</details>


In [4]:
def most_frequent_word(text):
    """Return the word that occurs most often in ``text``.

    Words are the whitespace-separated tokens of ``text``. When several words
    share the highest count, the lexicographically smallest one is returned.

    Parameters
    ----------
    text : str
        Message to analyse.

    Returns
    -------
    str
        The most frequent word, or ``''`` when ``text`` contains no words.
    """
    # Count every word in one pass instead of re-splitting and re-counting
    # the whole message for each token.
    counts = Counter(text.split())
    if not counts:
        return ''
    top_count = max(counts.values())
    # min() resolves ties between equally frequent words lexicographically.
    return min(word for word, count in counts.items() if count == top_count)

# One most-frequent word per cleaned message.
df['most_frequent_word'] = df['preprocessed_messages'].map(most_frequent_word)

In [5]:
def max_length_word(text):
    """Return the longest whitespace-separated word in ``text``.

    Ties between equally long words go to the lexicographically smallest one;
    a text without any words yields ``''``.
    """
    # Ordering key: longest first (negated length), then lexicographic order.
    return min(text.split(), key=lambda word: (-len(word), word), default='')

# One longest word per cleaned message, then show the enriched frame.
df['max_length_word'] = df['preprocessed_messages'].map(max_length_word)
df

Unnamed: 0,message,preprocessed_messages,most_frequent_word,max_length_word
0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,amore,available
1,Ok lar... Joking wif u oni...,ok lar joking wif u oni,joking,joking
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final...,to,questionstd
3,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,say,already
4,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,he,around
...,...,...,...,...
5567,This is the 2nd time we have tried 2 contact u...,this is the nd time we have tried contact u u...,have,btnationalrate
5568,Will ü b going to esplanade fr home?,will ü b going to esplanade fr home,b,esplanade
5569,"Pity, * was in mood for that. So...any other s...",pity was in mood for that soany other suggest...,for,suggestions
5570,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like id ...,acted,interested
