## Web scrapping

# Part 1: Extracting Company Names
**The code will extract company names and save them to a CSV file**

# Part 2: Extracting Detailed Information
**This part of the code will read the company names from the CSV file, normalize the names for URL generation, and extract detailed information from the profile pages.**

In [2]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Set up the WebDriver (Chrome in this case)
options = webdriver.ChromeOptions()
# options.add_argument("--headless")  # Uncomment this line if you want to run the browser in headless mode
driver = webdriver.Chrome(options=options)

# Function to normalize company names for URL
def normalize_company_name(name):
    normalization_map = {
    "ALİENFENCE": "alenfence", 
    "MİA TEKNOLOJİ A.Ş.": "ma-teknoloj-a",
    "AṬLAS": "alas",
    "CHECKUPS": "checkups-cova"
            }
    if name in normalization_map:
        return normalization_map[name]
    name = name.lower()
    name = name.replace(' ', '-')
    name = name.replace('&', '')
    name = name.replace('–', '')
    name = name.replace('(', '')
    name = name.replace(')', '')
    name = name.replace('é', '')
    name = name.replace('ü', '')
    name = name.replace('ó', '')
    name = name.replace('ş', '')
    name = name.replace('è', '')
    name = name.replace('ü', '')
    name = name.replace('ö', '')
    name = name.replace('â', '')
    name = name.replace('\'', '')
    name = name.replace('/', '')
    name = name.replace('ñ', '')
    name = name.replace('ç', '')
    name = name.replace('ã', '')
    name = name.replace('ä', '')
    name = name.replace(',', '')
    name = name.replace('.', '')
    name = name.replace('|', '')
    name = name.replace('+', '')
    name = name.replace('~', '')
    name = name.replace(':', '')
    name = '-'.join(name.split())  # Remove multiple consecutive spaces and dashes
    return name

# Function to extract company names
def extract_company_names(driver):
    company_names = []
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        companies = driver.find_elements(By.CLASS_NAME, "heading")
        for company in companies:
            try:
                name = company.text.strip()
                if name and name not in company_names:
                    company_names.append(name)
            except Exception as e:
                print(f"Error extracting company name: {e}")
        
        # Scroll down to load more companies
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(9)  # Adjust sleep time as needed to wait for the page to load
        new_height = driver.execute_script("return document.body.scrollHeight")

        if new_height == last_height:
            break
        last_height = new_height

    return company_names

# Function to extract detailed information from the profile page
def extract_detailed_info(driver, url):
    driver.get(url)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'h4.group.card-title.inner.list-group-item-heading'))
        )
        name = driver.find_element(By.CSS_SELECTOR, 'h4.group.card-title.inner.list-group-item-heading').text
        try:
            stand_info = driver.find_element(By.CSS_SELECTOR, 'div.head_discription p').text
        except NoSuchElementException:
            stand_info = "No stand info available"
        
        try:
            country = driver.find_element(By.CSS_SELECTOR, 'div.head_discription p span').text.strip()
        except NoSuchElementException:
            country = "No country info available"
        
        try:
            description = driver.find_element(By.CSS_SELECTOR, 'p.group.inner.float-left').text.strip()
        except NoSuchElementException:
            description = "No description available"
        # Extract categories
        try:
            categories_elements = driver.find_elements(By.CSS_SELECTOR, 'ul.sector_block li')
            categories = [category.text for category in categories_elements]
            if not categories:
                categories = ["No categories available"]
        except NoSuchElementException:
            categories = ["No categories available"]


        return {
            "Company Name": name,
            "Stand Info": stand_info,
            "Country": country,
            "Description": description,
            "Skills": ", ".join(categories)
        }
    except TimeoutException:
        print(f"Timeout loading page: {url}")
        return None
    except NoSuchElementException as e:
        print(f"Element not found on page {url}: {e}")
        return None
    except Exception as e:
        print(f"Error extracting detailed data for a company: {e}")
        return None

try:
    # Navigate to the EXHIBITOR LIST page
    driver.get("https://exhibitors-dwtc.exhibitoronlinemanual.com/gitex-africa-2024/Exhibitor")

    # Wait for the page to load
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )

    # Extract company names
    all_company_names = extract_company_names(driver)

    # Save company names to CSV
    pd.DataFrame({"Company Names": all_company_names}).to_csv('company_names.csv', index=False)

    # Extract detailed information for each company
    all_companies_data = []
    base_url = "https://exhibitors-dwtc.exhibitoronlinemanual.com/gitex-africa-2024/Exhibitor/ExbDetails/"

    for company_name in all_company_names:
        normalized_name = normalize_company_name(company_name)
        company_url = base_url + normalized_name
        print(f"Scraping data for {company_name}: {company_url}")
        company_info = extract_detailed_info(driver, company_url)
        if company_info:
            all_companies_data.append(company_info)
        time.sleep(9)  # Adjust sleep time as needed to wait for the page to load

    # Save the data into a pandas DataFrame
    df = pd.DataFrame(all_companies_data)
    df.to_csv('mntdata.csv', index=False)
    print("Data saved to exhibitors_data.csv")

finally:
    # Close the WebDriver
    driver.quit()


Scraping data for 01TALENT AFRICA: https://exhibitors-dwtc.exhibitoronlinemanual.com/gitex-africa-2024/Exhibitor/ExbDetails/01talent-africa
Scraping data for 212 FOUNDERS BY CDG INVEST: https://exhibitors-dwtc.exhibitoronlinemanual.com/gitex-africa-2024/Exhibitor/ExbDetails/212-founders-by-cdg-invest
Scraping data for 3GCOM SARL: https://exhibitors-dwtc.exhibitoronlinemanual.com/gitex-africa-2024/Exhibitor/ExbDetails/3gcom-sarl
Scraping data for 3LABAL.APP: https://exhibitors-dwtc.exhibitoronlinemanual.com/gitex-africa-2024/Exhibitor/ExbDetails/3labalapp
Scraping data for 3N SYSTÈMES: https://exhibitors-dwtc.exhibitoronlinemanual.com/gitex-africa-2024/Exhibitor/ExbDetails/3n-systmes
Scraping data for 42GEARS MOBILITY SYSTEMS: https://exhibitors-dwtc.exhibitoronlinemanual.com/gitex-africa-2024/Exhibitor/ExbDetails/42gears-mobility-systems
Scraping data for 75WAY TECHNOLOGIES PRIVATE LIMITED: https://exhibitors-dwtc.exhibitoronlinemanual.com/gitex-africa-2024/Exhibitor/ExbDetails/75way-t

## **processing**

This script ensures that companies listed in "company_names.csv" but not found in "mntdata.csv" are identified and saved to a new file for further processing.

In [13]:
import pandas as pd

# Load the company names from both CSV files
company_names_df = pd.read_csv('company_names.csv')
exhibitors_data_df = pd.read_csv('mntdata.csv')

# Print column names to debug
print("Company Names DataFrame columns:", company_names_df.columns)
print("Exhibitors Data DataFrame columns:", exhibitors_data_df.columns)

# Ensure column names are stripped of any leading/trailing whitespace
company_names_df.columns = company_names_df.columns.str.strip()
exhibitors_data_df.columns = exhibitors_data_df.columns.str.strip()

# Rename the column in company_names_df to match exhibitors_data_df
company_names_df.rename(columns={'Company Names': 'Company Name'}, inplace=True)

# Check if 'Company Name' exists in both DataFrames
if 'Company Name' not in company_names_df.columns or 'Company Name' not in exhibitors_data_df.columns:
    print("Column 'Company Name' not found in one or both DataFrames.")
else:
    # Convert company names to uppercase in both DataFrames for case-insensitive comparison
    company_names_df['Company Name'] = company_names_df['Company Name'].str.upper()
    exhibitors_data_df['Company Name'] = exhibitors_data_df['Company Name'].str.upper()

    # Find companies present in company_names_df but missing in exhibitors_data_df
    missing_companies = company_names_df[~company_names_df['Company Name'].isin(exhibitors_data_df['Company Name'])]

    # Save the missing companies to a new CSV file
    missing_companies.to_csv('missing_companies.csv', index=False)

    print("Missing companies saved to missing_companies.csv")


Company Names DataFrame columns: Index(['Company Names'], dtype='object')
Exhibitors Data DataFrame columns: Index(['Company Name', 'Stand Info', 'Country', 'Description', 'Skills'], dtype='object')
Missing companies saved to missing_companies.csv


In [14]:
missing_companies = pd.read_csv('missing_companies.csv')

In [15]:
missing_companies

Unnamed: 0,Company Name
0,ALMB | LOUERMABORNE.FR | ACHETERMABORNE.FR
1,FUJIFILM – PRINTING SOLUTIONS
2,IDX HAVACILIK ÇÖZÜMLERI VE BILIŞIM TEKNOLOJILE...
3,MİA TEKNOLOJİ A.Ş.
4,CHECKUPS
5,FÉDÉRATION MAROCAINE DES INDUSTRIES DE LA SANT...
6,ME® MOBILE ECOSYSTEMS


In [42]:
# Set up the WebDriver (Chrome in this case)
options = webdriver.ChromeOptions()
# options.add_argument("--headless")  # Uncomment this line if you want to run the browser in headless mode
driver = webdriver.Chrome(options=options)

# Function to normalize company names for URL
def normalize_company_name(name):
    normalization_map = {
    "ALİENFENCE": "alenfence", 
    "MİA TEKNOLOJİ A.Ş.": "ma-teknoloj-a",
    "AṬLAS": "alas",
    "CHECKUPS": "checkups-cova"
            }
    if name in normalization_map:
        return normalization_map[name]
    name = name.lower()
    name = name.replace(' ', '-')
    name = name.replace('&', '')
    name = name.replace('–', '')
    name = name.replace('(', '')
    name = name.replace(')', '')
    name = name.replace('é', '')
    name = name.replace('®', '')
    name = name.replace('ü', '')
    name = name.replace('ó', '')
    name = name.replace('ş', '')
    name = name.replace('è', '')
    name = name.replace('ü', '')
    name = name.replace('ö', '')
    name = name.replace('â', '')
    name = name.replace('\'', '')
    name = name.replace('/', '')
    name = name.replace('ñ', '')
    name = name.replace('ç', '')
    name = name.replace('ã', '')
    name = name.replace('ä', '')
    name = name.replace(',', '')
    name = name.replace('.', '')
    name = name.replace('|', '')
    name = name.replace('+', '')
    name = name.replace('~', '')
    name = name.replace(':', '')
    name = '-'.join(name.split())  # Remove multiple consecutive spaces and dashes
    return name
# Function to extract detailed information from the profile page
def extract_detailed_info(driver, url):
    driver.get(url)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'h4.group.card-title.inner.list-group-item-heading'))
        )
        name = driver.find_element(By.CSS_SELECTOR, 'h4.group.card-title.inner.list-group-item-heading').text
        try:
            stand_info = driver.find_element(By.CSS_SELECTOR, 'div.head_discription p').text
        except NoSuchElementException:
            stand_info = "No stand info available"
        
        try:
            country = driver.find_element(By.CSS_SELECTOR, 'div.head_discription p span').text.strip()
        except NoSuchElementException:
            country = "No country info available"
        
        try:
            description = driver.find_element(By.CSS_SELECTOR, 'p.group.inner.float-left').text.strip()
        except NoSuchElementException:
            description = "No description available"
        # Extract categories
        try:
            categories_elements = driver.find_elements(By.CSS_SELECTOR, 'ul.sector_block li')
            categories = [category.text for category in categories_elements]
            if not categories:
                categories = ["No categories available"]
        except NoSuchElementException:
            categories = ["No categories available"]
        return {
            "Company Name": name,
            "Stand Info": stand_info,
            "Country": country,
            "Description": description,
            "Skills": ", ".join(categories)
        }
    except TimeoutException:
        print(f"Timeout loading page: {url}")
        return None
    except NoSuchElementException as e:
        print(f"Element not found on page {url}: {e}")
        return None
    except Exception as e:
        print(f"Error extracting detailed data for a company: {e}")
        return None

try:
    # Read company names from the CSV file
    df_names = pd.read_csv('missing_companies.csv')
    all_company_names = df_names['Company Name'].tolist()

    # Extract detailed information for each company
    all_companies_data = []
    base_url = "https://exhibitors-dwtc.exhibitoronlinemanual.com/gitex-africa-2024/Exhibitor/ExbDetails/"

    for company_name in all_company_names:
        normalized_name = normalize_company_name(company_name)
        company_url = base_url + normalized_name
        print(f"Scraping data for {company_name}: {company_url}")
        company_info = extract_detailed_info(driver, company_url)
        if company_info:
            all_companies_data.append(company_info)
        time.sleep(5)  # Adjust sleep time as needed to wait for the page to load

    # Save the data into a pandas DataFrame
    df = pd.DataFrame(all_companies_data)
    df.to_csv('missing_data_info.csv', index=False)
    print("Data saved to missing_dat_info.csv")

finally:
    # Close the WebDriver
    driver.quit()


Scraping data for ALMB | LOUERMABORNE.FR | ACHETERMABORNE.FR: https://exhibitors-dwtc.exhibitoronlinemanual.com/gitex-africa-2024/Exhibitor/ExbDetails/almb--louermabornefr--achetermabornefr
Scraping data for FUJIFILM – PRINTING SOLUTIONS: https://exhibitors-dwtc.exhibitoronlinemanual.com/gitex-africa-2024/Exhibitor/ExbDetails/fujifilm--printing-solutions
Scraping data for IDX HAVACILIK ÇÖZÜMLERI VE BILIŞIM TEKNOLOJILERI TICARET A.Ş.: https://exhibitors-dwtc.exhibitoronlinemanual.com/gitex-africa-2024/Exhibitor/ExbDetails/idx-havacilik-zmleri-ve-biliim-teknolojileri-ticaret-a
Timeout loading page: https://exhibitors-dwtc.exhibitoronlinemanual.com/gitex-africa-2024/Exhibitor/ExbDetails/idx-havacilik-zmleri-ve-biliim-teknolojileri-ticaret-a
Scraping data for MİA TEKNOLOJİ A.Ş.: https://exhibitors-dwtc.exhibitoronlinemanual.com/gitex-africa-2024/Exhibitor/ExbDetails/ma-teknoloj-a
Scraping data for CHECKUPS: https://exhibitors-dwtc.exhibitoronlinemanual.com/gitex-africa-2024/Exhibitor/ExbDe

In [44]:
import pandas as pd

# Load the two CSV files
df1 = pd.read_csv('mntdata.csv')
df2 = pd.read_csv('missing_data_info.csv')

# Concatenate the two DataFrames by union
combined_df = pd.concat([df1, df2], ignore_index=True)

# Save the combined DataFrame to a new CSV file
combined_df.to_csv('final_output.csv', index=False)

print("Data saved to final_output.csv.csv")


Data saved to final_output.csv.csv


## Processing data

Predict how likely it is for a company to integrate Microsoft's products and services based on its description, keywords and phrases that align with the capabilities and solutions offered by Microsoft

In [None]:
# Load the CSV file
df3 = pd.read_csv('data.csv') #final_output.csv

# Display the columns to identify the country column
#print("Columns in the DataFrame:", df3.columns)

# Assuming the column containing country information is named 'Country'
# If it has a different name, replace 'Country' with the correct column name
unique_countries = df3['Country'].unique()

# Print the unique countries
print("Unique countries in the data:", unique_countries)


Unique countries in the data: ['Portugal' 'Morocco' 'India' 'United States of America' 'China'
 'United Arab Emirates' 'Senegal' 'South Africa' 'Ethiopia' 'Saudi Arabia'
 'France' 'Egypt' 'Nigeria' 'Kenya' 'Cape Verde'
 "Cote d'Ivoire (Ivory Coast)" 'Pakistan' 'Türkiye' 'Uganda' 'Japan'
 'Switzerland' 'Virgin Islands (British)' 'Tunisia' 'Poland'
 'Czech Republic' 'Cyprus' 'United Kingdom' 'Germany' 'Azerbaijan'
 'Netherlands' 'Spain' 'Mauritania' 'Korea, Republic Of' 'Estonia'
 'Russian Federation' 'Mongolia' 'Benin' 'Lebanon' 'Cameroon' 'Canada'
 'Hong Kong Special Administrative Region' 'New Zealand' 'Italy'
 'Tanzania' 'Togo' 'Rwanda' 'Mauritius' 'Australia' 'Slovakia'
 'Uzbekistan' 'Iran' 'Kazakhstan' 'Taiwan Province of China' 'Norway'
 'Equatorial Guinea' 'Denmark' 'Congo, Democratic Republic Of The'
 'Republic of Ireland' 'Ghana' 'Burkina Faso' 'Zambia' 'Greece' 'Guinea'
 'Belarus' 'Jordan' 'Romania' 'Singapore' 'Malta' 'Luxembourg' 'Bulgaria'
 'Sudan' 'Oman' 'United States Min

In [10]:
# List of African and European countries
african_countries = [
    'Algeria', 'Angola', 'Benin', 'Botswana', 'Burkina Faso', 'Burundi', 'Cabo Verde',
    'Cameroon', 'Central African Republic', 'Chad', 'Comoros', 'Congo', 'Democratic Republic of the Congo',
    'Djibouti', 'Egypt', 'Equatorial Guinea', 'Eritrea', 'Eswatini', 'Ethiopia', 'Gabon',
    'Gambia', 'Ghana', 'Guinea', 'Guinea-Bissau', 'Ivory Coast', 'Kenya', 'Lesotho', 'Liberia',
    'Libya', 'Madagascar', 'Malawi', 'Mali', 'Mauritania', 'Mauritius', 'Morocco', 'Mozambique',
    'Namibia', 'Niger', 'Nigeria', 'Rwanda', 'Sao Tome and Principe', 'Senegal', 'Seychelles',
    'Sierra Leone', 'Somalia', 'South Africa', 'South Sudan', 'Sudan', 'Tanzania', 'Togo', 'Tunisia',
    'Uganda', 'Zambia', 'Zimbabwe'
]

european_countries = [
    'Albania', 'Andorra', 'Armenia', 'Austria', 'Azerbaijan', 'Belarus', 'Belgium', 'Bosnia and Herzegovina',
    'Bulgaria', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France',
    'Georgia', 'Germany', 'Greece', 'Hungary', 'Iceland', 'Ireland', 'Italy', 'Kazakhstan', 'Kosovo',
    'Latvia', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Malta', 'Moldova', 'Monaco', 'Montenegro',
    'Netherlands', 'North Macedonia', 'Norway', 'Poland', 'Portugal', 'Romania', 'Russia', 'San Marino',
    'Serbia', 'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'Switzerland', 'Turkey', 'Ukraine',
    'United Kingdom', 'Vatican City', 'Israel'
]

# Filter the DataFrame to keep only African and European countries
filtered_df = df3[df3['Country'].isin(african_countries + european_countries)]

# Print the filtered DataFrame
print(filtered_df)

# Save the filtered DataFrame to a new CSV file
filtered_df.to_csv('filtered_mntdata.csv', index=False)


                    Company Name                            Stand Info  \
0                01Talent Africa  Stand No - 14A-16, Hall No - Hall 14   
1     212 Founders by CDG Invest    Stand No - 9A-10, Hall No - Hall 9   
2                     3GCOM SARL  Stand No - 12C-61, Hall No - Hall 12   
3                     3labal.App    Stand No - 9D-23, Hall No - Hall 9   
4                    3N Systèmes    Stand No - 2D-12, Hall No - Hall 2   
...                          ...                                   ...   
1226                     VTC/VTL    Stand No - 8P-33, Hall No - Hall 8   
1227                   W-All Fit  Stand No - 13B-10, Hall No - Hall 13   
1228                       Weego  Stand No - 18B-48, Hall No - Hall 18   
1229         Workpay Africa Ltd.  Stand No - 18B-46, Hall No - Hall 18   
1230                         Yux     Stand No - 2A-9, Hall No - Hall 2   

       Country                                        Description  \
0     Portugal  01Talent Africa is both a 

In [17]:
# Define the company name you want to display
company_name = "DYN IT MAROC"

# Filter the DataFrame for the specific company
dyn_it = filtered_df[filtered_df['Company Name'].str.lower() == company_name.lower()]

# Check if the company exists in the data
if not dyn_it.empty:
    print(dyn_it)
else:
    print(f"No information found for the company: {company_name}")

In [18]:
dyn_it

Unnamed: 0,Company Name,Stand Info,Country,Description,Skills
298,DYN IT MAROC,"Stand No - 4B-25, Hall No - Hall 4",Morocco,Dyn IT Maroc offre une gamme de services axés ...,"Education Tech, Cloud Services, Big Data & Ana..."


In [None]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('data.csv') #filtered_mntdata.csv

# Define integration and competition keywords
integration_keywords = ['cloud', 'ai', 'machine learning', 'big data', 'analytics', 'microsoft', 'azure', 'office 365', 'collaboration', 'enterprise']
competition_keywords = ['saas', 'software development', 'platform', 'solution provider', 'custom software', 'digital transformation', 'it services']

# Function to calculate scores based on keywords in description and skills
def calculate_scores(description, skills):
    integration_score = sum(1 for keyword in integration_keywords if keyword in description.lower() or keyword in skills.lower())
    competition_score = sum(1 for keyword in competition_keywords if keyword in description.lower() or keyword in skills.lower())
    return integration_score, competition_score

# Fill NaN values in 'Description' and 'Skills' with empty strings
df['Description'] = df['Description'].fillna('')
df['Skills'] = df['Skills'].fillna('')

# Calculate scores for each company
df['Integration Score'] = df.apply(lambda row: calculate_scores(row['Description'], row['Skills'])[0], axis=1)
df['Competition Score'] = df.apply(lambda row: calculate_scores(row['Description'], row['Skills'])[1], axis=1)

# Categorize companies
df['Category'] = df.apply(lambda row: 'High Potential' if row['Integration Score'] > 3 and row['Competition Score'] < 2 else ('Competitor' if row['Competition Score'] > 2 else 'Moderate Potential'), axis=1)

# Define the order for the 'Category' column
category_order = {'High Potential': 0, 'Competitor': 1, 'Moderate Potential': 2}

# Sort companies by 'Category' first, then 'Integration Score' and 'Competition Score'
df_sorted = df.sort_values(by=['Category', 'Integration Score', 'Competition Score'], key=lambda x: x.map(category_order) if x.name == 'Category' else x, ascending=[True, False, True])

# Save the data into a pandas DataFrame
df_sorted.to_csv('companies_analysis_with_scores_sorted.csv', index=False)
print("Data saved to companies_analysis_with_scores_sorted.csv")

# Function to show information for a specific company
def show_company_info(df, company_name):
    company_info = df[df['Company Name'].str.contains(company_name, case=False, na=False)]
    if not company_info.empty:
        print(company_info[['Company Name', 'Integration Score', 'Competition Score', 'Category', 'Description', 'Skills']])
    else:
        print(f"No information found for company: {company_name}")

# Example: Show information for a specific company
#show_company_info(df_sorted, 'example company name')  # Replace with the actual company name


Data saved to companies_analysis_with_scores_sorted.csv
No information found for company: example company name


## Region Mapping

In [1]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('organized.csv')

# Define the mapping of countries to regions
africa_countries = [
    "Cote d'Ivoire (Ivory Coast)",
    'Algeria', 'Angola', 'Benin', 'Botswana', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cameroon', 'Central African Republic', 
    'Chad', 'Comoros', 'Congo', 'Cote d\'Ivoire (Ivory Coast)', 'Democratic Republic of the Congo', 'Djibouti', 'Egypt', 
    'Equatorial Guinea', 'Eritrea', 'Eswatini', 'Ethiopia', 'Gabon', 'Gambia', 'Ghana', 'Guinea', 'Guinea-Bissau', 
    'Kenya', 'Lesotho', 'Liberia', 'Libya', 'Madagascar', 'Malawi', 'Mali', 'Mauritania', 'Mauritius', 'Morocco', 'Mozambique', 
    'Namibia', 'Niger', 'Nigeria', 'Rwanda', 'Sao Tome and Principe', 'Senegal', 'Seychelles', 'Sierra Leone', 'Somalia', 
    'South Africa', 'South Sudan', 'Sudan', 'Tanzania', 'Togo', 'Tunisia', 'Uganda', 'Zambia', 'Zimbabwe'
]

europe_countries = [
    'Albania', 'Andorra', 'Armenia', 'Austria', 'Azerbaijan', 'Belarus', 'Belgium', 'Bosnia and Herzegovina', 'Bulgaria', 
    'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Georgia', 'Germany', 'Greece', 
    'Hungary', 'Iceland', 'Ireland', 'Italy', 'Kazakhstan', 'Kosovo', 'Latvia', 'Liechtenstein', 'Lithuania', 'Luxembourg', 
    'Malta', 'Moldova', 'Monaco', 'Montenegro', 'Netherlands', 'North Macedonia', 'Norway', 'Poland', 'Portugal', 'Romania','Republic of Ireland',
    'Russia', 'San Marino', 'Serbia', 'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'Switzerland', 'Türkiye', 'Ukraine', 'United Kingdom'
]

asia_countries = [
    'Korea, Republic Of',
    'Afghanistan', 'Bahrain', 'Bangladesh', 'Bhutan', 'Brunei', 'Cambodia', 'China', 'East Timor', 'India', 'Indonesia', 
    'Iran', 'Iraq', 'Israel', 'Japan', 'Jordan', 'Kuwait', 'Kyrgyzstan', 'Laos', 'Lebanon', 'Malaysia', 'Maldives', 'Hong Kong Special Administrative Region',
    'Mongolia', 'Myanmar', 'Nepal', 'North Korea', 'Oman', 'Pakistan', 'Palestine', 'Philippines', 'Qatar', 'Saudi Arabia', 
    'Singapore', 'South Korea', 'Sri Lanka', 'Syria', 'Taiwan', 'Tajikistan', 'Thailand', 'Turkmenistan', 'United Arab Emirates', 'Uzbekistan', 'Vietnam', 'Yemen'
]

oceania_countries = [
    'Australia', 'Fiji', 'Kiribati', 'Marshall Islands', 'Micronesia', 'Nauru', 'New Zealand', 'Palau', 'Papua New Guinea', 
    'Samoa', 'Solomon Islands', 'Tonga', 'Tuvalu', 'Vanuatu'
]

america_countries = [
    'Antigua and Barbuda', 'Argentina', 'Bahamas', 'Barbados', 'Belize', 'Bolivia', 'Brazil', 'Canada', 'Chile', 
    'Colombia', 'Costa Rica', 'Cuba', 'Dominica', 'Dominican Republic', 'Ecuador', 'El Salvador', 'Grenada', 'Guatemala', 
    'Guyana', 'Haiti', 'Honduras', 'Jamaica', 'Mexico', 'Nicaragua', 'Panama', 'Paraguay', 'Peru', 'Saint Kitts and Nevis', 'United States of America',
    'Saint Lucia', 'Saint Vincent and the Grenadines', 'Suriname', 'Trinidad and Tobago', 'United States Minor Outlying Islands', 'Uruguay', 'Venezuela'
]

# Function to map countries to regions
def map_country_to_region(country):
    if country in africa_countries:
        return 'Africa'
    elif country in europe_countries:
        return 'Europe'
    elif country in asia_countries:
        return 'Asia'
    elif country in oceania_countries:
        return 'Oceania'
    elif country in america_countries:
        return 'America'
    else:
        return 'Other'

# Add the 'Region' column
df['Region'] = df['Country'].apply(map_country_to_region)

# Define the order for the 'Region' column
region_order = {'Africa': 0, 'Europe': 1, 'Asia': 2, 'Oceania': 3, 'America': 4, 'Other': 5}

# Sort the DataFrame by 'Region' first
df_sorted = df.sort_values(by='Region', key=lambda x: x.map(region_order))

# Save the sorted DataFrame to a new CSV file
df_sorted.to_csv('add_regions.csv', index=False)
print("Data saved to companies_analysis_with_regions_sorted.csv")


Data saved to companies_analysis_with_regions_sorted.csv


In [5]:
# Function to show information for a specific company
def show_company_info(df, company_name):
    company_info = df[df['Company Name'].str.contains(company_name, case=False, na=False)]
    if not company_info.empty:
        print(company_info[['Company Name', 'Description', 'Skills', 'Region']])
    else:
        print(f"No information found for company: {company_name}")
company_name = "DYN IT MAROC"
# Example: Show information for a specific company
show_company_info(df_sorted, company_name)  # Replace with the actual company name


     Company Name                                        Description  \
298  DYN IT MAROC  Dyn IT Maroc offre une gamme de services axés ...   

                                                Skills  Region  
298  Education Tech, Cloud Services, Big Data & Ana...  Africa  


## analyzing company descriptions and keywords
show the likelihood percentage of a company integrating Microsoft's products and services based on the provided keywords

In [6]:
import pandas as pd
# Load the CSV file 
df = pd.read_csv('data/companies_analysis_with_regions_sorted.csv')

# Remove companies with "No categories available" and "No description available"
# Remove companies with both "No categories available" and "No description available"
filtered_df = df[
    ~((df["Skills"] == "No categories available") & (df["Description"] == "No description available"))
]

# Save the filtered data to a new CSV file
filtered_df.to_csv('data/filtered_companies.csv', index=False)
print("Filtered data saved to filtered_companies.csv")


Filtered data saved to filtered_companies.csv


In [7]:
filtered_df

Unnamed: 0,Company Name,Stand Info,Country,Description,Skills,Region
1,SOPHATEL,"Stand No - 4E-12, Hall No - Hall 4",Morocco,SOPHATEL is an established IT company with ove...,No categories available,Africa
2,SOMAYAR,"Stand No - 13E-13, Hall No - Hall 13",Morocco,SOMAYAR se distingue également par ses partena...,"Smart Cities, Internet of Things (IOT), Applic...",Africa
3,iKnowFarm,"Stand No - 8P-10, Hall No - Hall 8",Uganda,"Since our launch in April 2022, we have worked...","AgriTech & FoodTech, Software Services, Agricu...",Africa
4,iladary,"Stand No - 8L-40, 8L-41, Hall No - Hall 8",Morocco,Iladary introduces a revolutionary delivery pl...,"Digital Finance, Incubator/Accelerator/Investm...",Africa
5,SOLUTION NUMÉRIQUE POUR L'AFRIQUE,"Stand No - 4E-18, Hall No - Hall 4",Morocco,Unlocking the power of education across Africa...,No categories available,Africa
...,...,...,...,...,...,...
1225,PJSC Softline,"Stand No - 3D-20, Hall No - Hall 3",Russian Federation,Softline Group is one of the IT market leaders...,"Smart Cities, Retail Tech, Internet of Things ...",Other
1226,Emigrante CV,"Stand No - 9K-15, Hall No - Hall 9",Cape Verde,Emigrante CV was founded in 2020 by three youn...,No categories available,Other
1227,Getac Technology Corporation,"Stand No - 2D-13, Hall No - Hall 2",Taiwan Province of China,LEADING THE WAY IN RUGGED TECHNOLOGY Getac Tec...,"Space Tech, Consumer Tech, Smart Home & Smart ...",Other
1229,KIVUGREEN CORPORATION (1),"Stand No - 8M-51, Hall No - Hall 8","Congo, Democratic Republic Of The",Overview: KivuGreen Corporation is a leading t...,No categories available,Other


In [10]:
import spacy
import pandas as pd

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Load the CSV file (replace '/mnt/data/mntdata.csv' with your actual file path)
df = pd.read_csv('data/filtered_companies.csv')

# Combine Description and Skills for analysis
df["Combined"] = df["Description"].fillna('') + " " + df["Skills"].fillna('')

# Define keywords related to Microsoft's offerings
microsoft_keywords = [
    "Education Tech", "Cloud Services", "Big Data & Analytics", "Artificial Intelligence",
    "Chatbots / Virtual Assistant", "Computer Vision", "Data Extraction", "Deep Learning Platforms",
    "Facial / Iris Recognition", "Machine Learning", "Natural Language Processing (NLP)",
    "Neural Networks", "Optical character recognition (OCR)", "Predictive APIs", "Real time translation",
    "Speech Recognition", "Business Intelligence", "Cloud Solutions", "Dashboard", "Data Analytics",
    "Data Architecture", "Data Integration", "Data Lake", "Data Management", "Data Science",
    "Data Visualisation", "Data Warehouse", "DataOpsDevops", "Extract, transform, load - ETL",
    "Machine Learning", "Natural Language Processing - NLP", "Structured Query Language - SQL",
    "Data as a Service - DAAS", "DevOps", "Training or Recruitement", "Learning strategy",
    "Training centre resources"
]

# Function to calculate likelihood percentage
def calculate_likelihood_percentage(combined_text):
    doc = nlp(combined_text.lower())
    total_keywords = len(microsoft_keywords)
    matched_keywords = sum(1 for keyword in microsoft_keywords if keyword.lower() in combined_text.lower())
    likelihood_percentage = (matched_keywords / total_keywords) * 100
    return likelihood_percentage

# Calculate likelihood percentage for each company
df["Likelihood Percentage"] = df["Combined"].apply(calculate_likelihood_percentage)

# Display the results
print(df[["Company Name", "Likelihood Percentage"]])

# Save the results to a CSV file
df.to_csv('data/company_likelihood.csv', index=False)
print("Data saved to company_likelihood.csv")


                           Company Name  Likelihood Percentage
0                              SOPHATEL               0.000000
1                               SOMAYAR               2.702703
2                             iKnowFarm               0.000000
3                               iladary              10.810811
4     SOLUTION NUMÉRIQUE POUR L'AFRIQUE               0.000000
...                                 ...                    ...
1104                      PJSC Softline              18.918919
1105                       Emigrante CV               0.000000
1106       Getac Technology Corporation               2.702703
1107          KIVUGREEN CORPORATION (1)               0.000000
1108                       ExperienTrip               2.702703

[1109 rows x 2 columns]
Data saved to company_likelihood.csv
