In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('vader_lexicon')

# Ensuring reproducibility in langdetect
DetectorFactory.seed = 0

# Load your Excel file
file_path = 'all_facebook_and_twitter_dataset.xlsx'

# Try loading the file with different engines based on extension
try:
    if file_path.endswith('.xlsx'):
        df = pd.read_excel(file_path, engine='openpyxl')
    elif file_path.endswith('.xls'):
        df = pd.read_excel(file_path, engine='xlrd')
    else:
        raise ValueError("File format not supported. Please provide a .xls or .xlsx file.")
except Exception as e:
    print(f"Error loading the Excel file: {e}")
    raise

# Function to detect language
def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"

# Sentiment analysis setup
sia = SentimentIntensityAnalyzer()

# Function to get sentiment
def get_sentiment(text):
    scores = sia.polarity_scores(text)
    if scores['compound'] >= 0.05:
        return 'Positive'
    elif scores['compound'] <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Apply language detection and sentiment analysis
df['Language'] = df['Comments'].apply(detect_language)
df['Word_Count'] = df['Comments'].apply(lambda x: len(word_tokenize(x)))
df['Sentiment'] = df['Comments'].apply(get_sentiment)

# Count English and Amharic words separately
def count_language_words(text, language):
    words = word_tokenize(text)
    language_words = [word for word in words if detect_language(word) == language]
    return len(language_words)

df['English_Words'] = df['Comments'].apply(lambda x: count_language_words(x, 'en'))

def count_amharic_words(text):
    # Define a regular expression pattern to match Amharic characters
    amharic_pattern = re.compile(r'[\u1200-\u137F\u1380-\u139F\u2D80-\u2DDF\uAB00-\uAB2F\uAB30-\uAB6F\uAB70-\uABBF\uABC0-\uABFF]+')
    # Find all matches of Amharic words in the text
    amharic_matches = re.findall(amharic_pattern, text)
    # Return the count of Amharic words
    return len(amharic_matches)

# Apply the function to count Amharic words in each comment
df['Amharic_Words'] = df['Comments'].apply(count_amharic_words)

# Determine the primary language of the comment
def determine_primary_language(row):
    if row['English_Words'] > 0 and row['Amharic_Words'] > 0:
        return 'Mixed'
    elif row['Amharic_Words'] > 0:
        return 'Amharic'
    elif row['English_Words'] > 0:
        return 'English'
    else:
        return 'Unknown'

df['Primary_Language'] = df.apply(determine_primary_language, axis=1)

# Further categorize unknown comments
def reclassify_unknown_comments(row):
    if row['Primary_Language'] == 'Unknown':
        english_word_count = count_language_words(row['Comments'], 'en')
        amharic_word_count = count_amharic_words(row['Comments'])
        if english_word_count > 0 and amharic_word_count > 0:
            return 'Mixed'
        elif amharic_word_count > 0:
            return 'Amharic'
        elif english_word_count > 0:
            return 'English'
        else:
            return 'Unknown'
    else:
        return row['Primary_Language']

df['Reclassified_Language'] = df.apply(reclassify_unknown_comments, axis=1)

# Calculating statistics
total_comments = len(df)
total_words = df['Word_Count'].sum()
english_words = df['English_Words'].sum()
amharic_words = df['Amharic_Words'].sum()
positive_comments = len(df[df['Sentiment'] == 'Positive'])
negative_comments = len(df[df['Sentiment'] == 'Negative'])
neutral_comments = len(df[df['Sentiment'] == 'Neutral'])

stats = {
    'Total Data Amount in Sentences': total_comments,
    'Total Amount in Words': total_words,
    'Number of English Words in Comments': english_words,
    'Number of Amharic Words in Comments': amharic_words,
    'Number of Positive Comments': positive_comments,
    'Number of Negative Comments': negative_comments,
    'Number of Neutral Comments': neutral_comments
}

# Calculate counts and percentages of each primary language category
language_counts = df['Reclassified_Language'].value_counts()
language_percentages = df['Reclassified_Language'].value_counts(normalize=True) * 100

# Print statistics
print("Statistics:")
for key, value in stats.items():
    print(f"{key}: {value}")

print("\nLanguage Statistics:")
print("Number of Amharic Comments:", language_counts.get('Amharic', 0))
print("Percentage of Amharic Comments: {:.2f}%".format(language_percentages.get('Amharic', 0)))
print("Number of English Comments:", language_counts.get('English', 0))
print("Percentage of English Comments: {:.2f}%".format(language_percentages.get('English', 0)))
print("Number of Mixed Comments:", language_counts.get('Mixed', 0))
print("Percentage of Mixed Comments: {:.2f}%".format(language_percentages.get('Mixed', 0)))
print("Number of Unknown Comments:", language_counts.get('Unknown', 0))
print("Percentage of Unknown Comments: {:.2f}%".format(language_percentages.get('Unknown', 0)))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Statistics:
Total Data Amount in Sentences: 13389
Total Amount in Words: 134813
Number of English Words in Comments: 7886
Number of Amharic Words in Comments: 99094
Number of Positive Comments: 2373
Number of Negative Comments: 711
Number of Neutral Comments: 10305

Language Statistics:
Number of Amharic Comments: 8681
Percentage of Amharic Comments: 64.84%
Number of English Comments: 2654
Percentage of English Comments: 19.82%
Number of Mixed Comments: 858
Percentage of Mixed Comments: 6.41%
Number of Unknown Comments: 1196
Percentage of Unknown Comments: 8.93%
