<a href="https://colab.research.google.com/github/UCREL/Session1_Visualisation_and_Summarisation/blob/main/Extra-NoteBooks/ArabicNews_wordcloud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


## Arabic News

Arabic news crawled from BBC, CNN, JSC, RT, and EuroNews
https://www.kaggle.com/datasets/mksaad/arabic-news/data

In [None]:
!pip install mglearn
!pip install gdown
!pip install wordcloud
!pip install plotly
!pip install spacy
!pip install spacy-langdetect
!pip install langdetect


In [None]:
import gdown

# Google Drive location of the zipped dataset, and the local file name it is
# saved under (`output` is read again by the extraction cell).
url = 'https://drive.google.com/uc?id=1REOz6rfcjPuIhIimsrpBwhifvmI9gCTx'
output = 'Arabic_news.zip'

gdown.download(url=url, output=output, quiet=False)

In [None]:
import zipfile
import os

# Extract into the current working directory: the CSV is later read through
# the relative path 'Arabic_news.csv', so extracting here keeps the notebook
# working outside Colab (where the working directory is not '/content/').
extract_dir = os.getcwd()

if zipfile.is_zipfile(output):
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(output, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    print("Files extracted:")
    print(os.listdir(extract_dir))
else:
    print("Downloaded file is not a zip file.")

In [None]:
import pandas as pd

# Load the dataset from 'Arabic_news.csv' (relative to the working directory)
# into the raw DataFrame `df` that the following cells inspect.
df = pd.read_csv('Arabic_news.csv')

# Display the first few rows of the DataFrame
df.head()

In [None]:
# Dimensions of the raw frame as a (rows, columns) tuple
df.shape

In [None]:

# Summary statistics of the raw frame
df.describe()


In [None]:
# Count the missing values in each column.
# NOTE(review): this cell was annotated "no missing values", yet a later cell
# fills NaNs in 'description' and 'date_publish' - confirm against the output.
df.isnull().sum()

In [None]:
# Keep only the three columns used for the source-distribution plots.
df3 = df[['description', 'source_domain', 'url']]

In [None]:
# Display the frame holding the three selected columns
df3

In [None]:
# Expose the 'description' column under the name 'text'.
df3 = df3.rename({'description': 'text'}, axis='columns')

In [None]:
# Bar chart of the 20 most frequent source domains.
top_sources = df3['source_domain'].value_counts().nlargest(20)
top_sources.plot.bar()

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Horizontal bar chart of the number of rows per source domain.
domain_sizes = df3.groupby('source_domain').size()
domain_ax = domain_sizes.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))

# Hide the top and right borders of the axes.
domain_ax.spines[['top', 'right']].set_visible(False)

In [None]:
import plotly.express as px

# Count articles per source domain, most frequent first. Naming the count
# column explicitly avoids the integer column 0 that a bare reset_index()
# creates, which the string key '0' in `labels` could never match.
domain_counts = (
    df3.groupby('source_domain')
    .size()
    .sort_values(ascending=False)
    .reset_index(name='count')
)

# Use Plotly for visualisation
fig = px.bar(domain_counts,
             x='source_domain', y='count',
             labels={'count': 'Count', 'source_domain': 'Source Domain'},
             color='source_domain',  # Color bars based on source domain
             color_discrete_sequence=px.colors.qualitative.Pastel)  # Use a color palette

fig.update_layout(
    title='Distribution of News Sources',
    xaxis_title='Source Domain',
    yaxis_title='Count',
    showlegend=False  # the x axis already names every source
)

fig.show()


Data prep

### 1.2 Preprocess the text data

We will use spaCy to preprocess the text data, including tokenization and stop words removal.

In [None]:
# Display the distribution of news sources
source_counts = df['source_domain'].value_counts()
print(source_counts)
# Determine the minimum count of articles across all sources
min_count = int(source_counts.min())
print(f"\nMinimum count of articles across all sources: {min_count}")

# Create a balanced subset: draw `min_count` articles from every source.
# GroupBy.sample keeps every column (including 'source_domain', which
# groupby().apply() is moving towards excluding) and the fixed random_state
# makes the subset identical on every run.
balanced_df = (
    df.groupby('source_domain')
    .sample(n=min_count, random_state=42)
    .reset_index(drop=True)
)

# Display the distribution of news sources in the balanced subset
balanced_source_counts = balanced_df['source_domain'].value_counts()
print("\nDistribution of news sources in the balanced subset:")
print(balanced_source_counts)

# Save the balanced dataset to a new CSV file
balanced_df.to_csv('BalancedArabicNewsDataset.csv', index=False)

print("\nBalanced dataset saved to 'BalancedArabicNewsDataset.csv'")

In [None]:
# Display the balanced subset (an equal number of rows per source domain)
balanced_df

In [None]:
import spacy
from spacy.lang.ar import Arabic

# Create the spaCy Arabic language object used for tokenization
nlp = Arabic()

def preprocess_text(text):
    """Tokenize `text` and drop stop words, punctuation and whitespace tokens.

    Parameters
    ----------
    text : str
        Raw article text. Any non-string value (e.g. NaN from a missing
        cell) is treated as empty instead of being passed to spaCy.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' for non-strings).
    """
    if not isinstance(text, str):
        return ''
    doc = nlp(text)
    tokens = [
        token.text
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return ' '.join(tokens)

balanced_df['clean_text'] = balanced_df['text'].apply(preprocess_text)

print("\nFirst few rows of the cleaned dataset:")
balanced_df[['text', 'clean_text']].head()


## Step 2: Data Visualization

### 2.1 Word Frequency Visualization

https://www.wordclouds.co.uk/


1. **Color Themes:**
   - `colormap='viridis'` sets a color theme for the word cloud. You can choose from various colormaps provided by Matplotlib (e.g., 'viridis', 'plasma', 'inferno', 'magma').

2. **Custom Shapes:**
   - `mask=mask` uses a custom mask image to shape the word cloud. You need to provide the path to an image file (e.g., a PNG file with a white background: pure-white pixels are masked out, so words are drawn only on the non-white part of the image).

3. **Advanced Styling:**
   - `contour_width` and `contour_color` add contour lines to the word cloud.
   - `random_state=42` ensures reproducibility by setting a random state.
   - `max_words=200` limits the number of words displayed in the word cloud.
   - `stopwords=stopwords` removes common stop words from the word cloud.



Get font:

In [None]:
import gdown

# Download the Arabic font used to render the word cloud.
# Dedicated names are used so that `url` / `output` from the dataset-download
# cell are not overwritten and the extraction cell stays re-runnable.
font_url = 'https://drive.google.com/uc?id=1HwUK9hixOUcF_8mrpP2nD4zbyMA0utz2'
font_path = 'Arabic_font.ttf'
gdown.download(font_url, font_path, quiet=False)

Get Image:

In [None]:
# Download the image that shapes the word cloud.
# Dedicated names are used so that `url` / `output` from the dataset-download
# cell are not overwritten and the extraction cell stays re-runnable.
mask_url = 'https://drive.google.com/uc?id=1WoIBqqLufm9Ou3toI5uKVIcRuC-_Qswh'
mask_path = 'Mic.png'
gdown.download(mask_url, mask_path, quiet=False)

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

def visualize_word_cloud(texts, mask_path, font_path):
    """Draw a mask-shaped word cloud built from a collection of texts.

    Parameters
    ----------
    texts : iterable of str
        Documents to visualise; they are joined with single spaces.
    mask_path : str
        Path of the image opened with PIL and passed to WordCloud as `mask`.
    font_path : str
        Path of the font file passed to WordCloud.
    """
    corpus = ' '.join(texts)

    # The mask image is handed to WordCloud as a NumPy array.
    shape_mask = np.array(Image.open(mask_path))

    cloud_options = dict(
        font_path=font_path,
        background_color='white',
        max_words=200,
        stopwords=set(STOPWORDS),   # wordcloud's built-in stop-word list
        mask=shape_mask,
        contour_width=3,
        contour_color='firebrick',
        colormap='viridis',
        width=800,
        height=400,
        random_state=42,            # fixed seed for a reproducible layout
    )
    cloud = WordCloud(**cloud_options)
    cloud.generate(corpus)

    # Show the rendered cloud without axes.
    plt.figure(figsize=(15, 10))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()


# Draw the cloud from the stop-word-filtered texts, using the downloaded
# 'Mic.png' image as the shape and 'Arabic_font.ttf' as the font.
visualize_word_cloud(balanced_df['clean_text'], 'Mic.png', 'Arabic_font.ttf')


In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import mglearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Load data
df = pd.read_csv('BalancedArabicNewsDataset.csv', on_bad_lines='skip')

# Check for null values
print(df.isnull().sum())

# Fill null values so the string concatenation below never sees NaN descriptions
df['description'] = df['description'].fillna('')
df['date_publish'] = df['date_publish'].fillna('Unknown')

# Use the article body plus its description as content, 'date_publish' for labels
df['text'] = df['text'] + ' ' + df['description']
# .copy() makes the two-column frame independent, so the assignments below
# modify it directly instead of a view of the loaded data.
df = df[['date_publish', 'text']].copy()
df.columns = ['year', 'abstract']

# Ensure no null abstracts (rows whose 'text' was missing)
df = df[pd.notnull(df['abstract'])].copy()

# Reduce the publication date to its four-digit year so that the labels really
# are years; rows without a recognisable year are labelled 'Unknown'.
# NOTE(review): assumes 'date_publish' holds date strings whose first 4-digit
# run is the year (e.g. '2018-08-08 14:10:00') - confirm against the data.
df['year'] = (
    df['year'].astype(str).str.extract(r'(\d{4})', expand=False).fillna('Unknown')
)

# Create categories (labels) from the year column
df['category_id'] = df['year'].factorize()[0]
category_id_df = df[['year', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'year']].values)

# One label order shared by the confusion matrix and its tick labels, so every
# row/column is annotated with the class it actually represents.
class_labels = sorted(df['year'].unique())

# Prepare the training and testing dataset once: the split and the TF-IDF
# features are identical for every model.
X_train, X_test, y_train, y_test = train_test_split(df['abstract'], df['year'], random_state=1, test_size=0.30)

# Vectorization (vocabulary learned from the training texts only)
count_vect = TfidfVectorizer(analyzer='word', ngram_range=(1, 1))
X_train_tfidf = count_vect.fit_transform(X_train)
X_test_tfidf = count_vect.transform(X_test)

# Algorithms setup: one factory per model name.
# `multi_class` is no longer passed to LogisticRegression: it is deprecated in
# recent scikit-learn releases, and lbfgs already fits a multinomial model for
# multi-class targets.
model_factories = {
    "SVM": lambda: svm.SVC(C=1.0, kernel='linear'),
    "NB": lambda: MultinomialNB(),
    "LR": lambda: LogisticRegression(random_state=0, solver='lbfgs', max_iter=4000),
}

# Define models to use
modelsArray = ["SVM", "NB", "LR"]
for model_type in modelsArray:
    clf = model_factories[model_type]()

    # Train the model
    train_model = clf.fit(X_train_tfidf, y_train)
    # Predicting years for testing data
    y_pred = train_model.predict(X_test_tfidf)
    # Print training and testing accuracy
    print("Training/Testing Accuracy", '\t', model_type, '\t', train_model.score(X_train_tfidf, y_train), '\t', train_model.score(X_test_tfidf, y_test))

    # Plot confusion matrices
    conf_mat = confusion_matrix(y_test, y_pred, labels=class_labels)
    fig, ax = plt.subplots(figsize=(9, 9))
    sns.heatmap(conf_mat, annot=True, fmt='d', cmap="RdBu_r",
                xticklabels=class_labels, yticklabels=class_labels, ax=ax)
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    ax.set_title(f'Confusion matrix ({model_type})')

    # Save the plot BEFORE showing it: once plt.show() has run, the figure has
    # been released and savefig would write an empty page.
    pltFileName = f'combined_{model_type}.pdf'
    fig.savefig(pltFileName)
    plt.show()
    plt.close(fig)


Now use what you have learnt to visualise and explore the data further.