In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

# Download necessary NLTK resources
nltk.download('punkt')

# Load your Excel file
file_path = 'all_facebook_and_twitter_dataset.xlsx'

# Try loading the file with different engines based on extension
try:
    if file_path.endswith('.xlsx'):
        df = pd.read_excel(file_path, engine='openpyxl')
    elif file_path.endswith('.xls'):
        df = pd.read_excel(file_path, engine='xlrd')
    else:
        raise ValueError("File format not supported. Please provide a .xls or .xlsx file.")
except Exception as e:
    print(f"Error loading the Excel file: {e}")
    raise

# Function to tokenize comments
def tokenize_comment(comment):
    return word_tokenize(comment)

# Function to generate n-grams
def generate_ngrams(tokens, n):
    return list(ngrams(tokens, n))

# Apply tokenization and n-gram generation
df['Tokens'] = df['Comments'].apply(tokenize_comment)

# Choose the value of n for n-grams
n = 2  # For example, 2 for bigrams, 3 for trigrams, etc.
df['N-Grams'] = df['Tokens'].apply(lambda tokens: generate_ngrams(tokens, n))

# Print tokens and n-grams for each comment
for index, row in df.iterrows():
    print(f"Comment: {row['Comments']}")
    print(f"Tokens: {row['Tokens']}")
    print(f"{n}-Grams: {row['N-Grams']}")
    print("\n")

# If you need to save the tokens and n-grams back to the Excel file
output_file_path = 'all_facebook_and_twitter_dataset_with_ngrams.xlsx'
df.to_excel(output_file_path, index=False, engine='openpyxl')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Comment: እባካችሁ እለታዊ ፖኬጅ ላይም ማሻሻያ አድርጉ ለእኛ ለድሆቹም አስቡ እንጂ
Tokens: ['እባካችሁ', 'እለታዊ', 'ፖኬጅ', 'ላይም', 'ማሻሻያ', 'አድርጉ', 'ለእኛ', 'ለድሆቹም', 'አስቡ', 'እንጂ']
2-Grams: [('እባካችሁ', 'እለታዊ'), ('እለታዊ', 'ፖኬጅ'), ('ፖኬጅ', 'ላይም'), ('ላይም', 'ማሻሻያ'), ('ማሻሻያ', 'አድርጉ'), ('አድርጉ', 'ለእኛ'), ('ለእኛ', 'ለድሆቹም'), ('ለድሆቹም', 'አስቡ'), ('አስቡ', 'እንጂ')]


Comment: መቼነው ቅናሹ የሚጀምረው wifi የመኖሪያባለ mg ባለ mg የነበረው ስንት ሚቀንሰውስ
Tokens: ['መቼነው', 'ቅናሹ', 'የሚጀምረው', 'wifi', 'የመኖሪያባለ', 'mg', 'ባለ', 'mg', 'የነበረው', 'ስንት', 'ሚቀንሰውስ']
2-Grams: [('መቼነው', 'ቅናሹ'), ('ቅናሹ', 'የሚጀምረው'), ('የሚጀምረው', 'wifi'), ('wifi', 'የመኖሪያባለ'), ('የመኖሪያባለ', 'mg'), ('mg', 'ባለ'), ('ባለ', 'mg'), ('mg', 'የነበረው'), ('የነበረው', 'ስንት'), ('ስንት', 'ሚቀንሰውስ')]


Comment: የቴሌን የተወሰነ ለግሉ ዘርፍ ይሸጣል የተባለው የት ደረሰ ሀገሪቱ ተፎካካሪ የሆነ ሌላ አማራጭ ኔትወርክ ከሌላ መቼም የኢትዮጵያ ቴሌኮም ለደንበኛው ትኩረት የታሪፍ ማሻሻያ አያደርግም
Tokens: ['የቴሌን', 'የተወሰነ', 'ለግሉ', 'ዘርፍ', 'ይሸጣል', 'የተባለው', 'የት', 'ደረሰ', 'ሀገሪቱ', 'ተፎካካሪ', 'የሆነ', 'ሌላ', 'አማራጭ', 'ኔትወርክ', 'ከሌላ', 'መቼም', 'የኢትዮጵያ', 'ቴሌኮም', 'ለደንበኛው', 'ትኩረት', 'የታሪፍ', 'ማሻሻያ', 'አያደርግም']
2-Grams: [('የቴሌን', 'የተ

Comment: ያተረፋቹት ከተጠቃሚው ስለዚ ለተጠቃሚው የሆነ ሚሊየን አትሰጡም
Tokens: ['ያተረፋቹት', 'ከተጠቃሚው', 'ስለዚ', 'ለተጠቃሚው', 'የሆነ', 'ሚሊየን', 'አትሰጡም']
2-Grams: [('ያተረፋቹት', 'ከተጠቃሚው'), ('ከተጠቃሚው', 'ስለዚ'), ('ስለዚ', 'ለተጠቃሚው'), ('ለተጠቃሚው', 'የሆነ'), ('የሆነ', 'ሚሊየን'), ('ሚሊየን', 'አትሰጡም')]


Comment: እባካችሁ ወደገጠር አከባቢ ምንም network እየሰራ አይደልም ስልኩ ክፍት እያለ ጭራሽ ዝግ ይላል እባካችሁ ዳታም ሆነ ኢንተርነት መጠቀም አልቻልንም እባካችሁ አስተካክሉልን መመስገኑ ካልቀረ ያው በሁሉም ተመሰገኑ እንጂ ባንደኛው ተወድሳችሁ በሌላኛው ደግሞ አትተቹ እንጂ
Tokens: ['እባካችሁ', 'ወደገጠር', 'አከባቢ', 'ምንም', 'network', 'እየሰራ', 'አይደልም', 'ስልኩ', 'ክፍት', 'እያለ', 'ጭራሽ', 'ዝግ', 'ይላል', 'እባካችሁ', 'ዳታም', 'ሆነ', 'ኢንተርነት', 'መጠቀም', 'አልቻልንም', 'እባካችሁ', 'አስተካክሉልን', 'መመስገኑ', 'ካልቀረ', 'ያው', 'በሁሉም', 'ተመሰገኑ', 'እንጂ', 'ባንደኛው', 'ተወድሳችሁ', 'በሌላኛው', 'ደግሞ', 'አትተቹ', 'እንጂ']
2-Grams: [('እባካችሁ', 'ወደገጠር'), ('ወደገጠር', 'አከባቢ'), ('አከባቢ', 'ምንም'), ('ምንም', 'network'), ('network', 'እየሰራ'), ('እየሰራ', 'አይደልም'), ('አይደልም', 'ስልኩ'), ('ስልኩ', 'ክፍት'), ('ክፍት', 'እያለ'), ('እያለ', 'ጭራሽ'), ('ጭራሽ', 'ዝግ'), ('ዝግ', 'ይላል'), ('ይላል', 'እባካችሁ'), ('እባካችሁ', 'ዳታም'), ('ዳታም', 'ሆነ'), ('ሆነ', 'ኢንተርነት'), ('ኢን

Comment: አረ እባክህ የሚበላው እያጣ አድርጉትና ሁሉም ኢትዮጵያዊ የአገሩን ሀብት በፍትሀዊነት የመጠቀም መብቱን አስከብሩ
Tokens: ['አረ', 'እባክህ', 'የሚበላው', 'እያጣ', 'አድርጉትና', 'ሁሉም', 'ኢትዮጵያዊ', 'የአገሩን', 'ሀብት', 'በፍትሀዊነት', 'የመጠቀም', 'መብቱን', 'አስከብሩ']
2-Grams: [('አረ', 'እባክህ'), ('እባክህ', 'የሚበላው'), ('የሚበላው', 'እያጣ'), ('እያጣ', 'አድርጉትና'), ('አድርጉትና', 'ሁሉም'), ('ሁሉም', 'ኢትዮጵያዊ'), ('ኢትዮጵያዊ', 'የአገሩን'), ('የአገሩን', 'ሀብት'), ('ሀብት', 'በፍትሀዊነት'), ('በፍትሀዊነት', 'የመጠቀም'), ('የመጠቀም', 'መብቱን'), ('መብቱን', 'አስከብሩ')]


Comment: ብሩ ፍጥነቱ ስንት
Tokens: ['ብሩ', 'ፍጥነቱ', 'ስንት']
2-Grams: [('ብሩ', 'ፍጥነቱ'), ('ፍጥነቱ', 'ስንት')]


Comment: safaricom ይጀምር ሌቦች
Tokens: ['safaricom', 'ይጀምር', 'ሌቦች']
2-Grams: [('safaricom', 'ይጀምር'), ('ይጀምር', 'ሌቦች')]


Comment: too expensive sometimes even network will aborted tele gets profit cost customer shame
Tokens: ['too', 'expensive', 'sometimes', 'even', 'network', 'will', 'aborted', 'tele', 'gets', 'profit', 'cost', 'customer', 'shame']
2-Grams: [('too', 'expensive'), ('expensive', 'sometimes'), ('sometimes', 'even'), ('even', 'network'), ('network', 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

