<a href="https://colab.research.google.com/github/hanarayan/EPAM_PRACTICE/blob/main/PracticeEPAM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark

In [None]:
from pyspark import SparkConf, SparkContext

# SparkContext.getOrCreate() returns the already-active context or builds a
# new one itself, so a try/except fallback around it is never needed.
# The configuration below is only consulted when a new context has to be
# created; the master is left to Spark's default for the environment.
conf = SparkConf().setAppName("LinkedIn Example")
sc = SparkContext.getOrCreate(conf)

# Report what is actually in use instead of guessing whether it was reused.
print(f"SparkContext ready (master={sc.master}, app={sc.appName})")


In [12]:
# Read the dataset as an RDD with one element per line of text.
dataset_path = "/content/LinkedIn people profiles datasets.csv"
rdd = sc.textFile(dataset_path)

In [None]:
# The first line of the file is treated as the CSV header.
header = rdd.first()

# Keep every line that is not identical to the header line.
rdd_no_header = rdd.filter(lambda record: record != header)

rows_with_header = rdd.count()
print(f"Total rows (Including header): {rows_with_header}")
rows_without_header = rdd_no_header.count()
print(f"Total rows (excluding header): {rows_without_header}")

In [None]:


import csv

# Parse the header with the csv module so a quoted column name that contains
# a comma is not split in two. Each name is then normalised the same way
# clean_column() treats values: whitespace first, then surrounding quotes,
# then any whitespace that was inside the quotes, and finally lower-cased so
# lookups with header_columns.index(...) are case-insensitive.
header_columns = [
    name.strip().strip('"').strip().lower()
    for name in next(csv.reader([header]), [])
]

print("Available Columns:")
for column in header_columns:
    print(column)



In [21]:
def _split_csv_line(line):
    """Split one line of CSV text into its fields.

    Double-quoted fields may contain commas; a plain ``line.split(",")``
    would cut such a field apart and shift every later column index.

    Note: the input is still one physical line, so a record whose quoted
    field spans several lines is not reassembled here.
    """
    # Imported locally so the function is self-contained when it is shipped
    # to the Spark workers.
    import csv

    try:
        fields = next(csv.reader([line]), [])
    except csv.Error:
        # Malformed quoting: fall back to the plain split instead of failing
        # the whole job on a single bad line.
        return line.split(",")
    # A blank line parses to no fields; keep the single empty field that
    # "".split(",") would have produced.
    return fields if fields else [""]


rdd_split = rdd_no_header.map(_split_csv_line)

In [55]:

def clean_column(code):
    """Normalise a raw CSV field to display text.

    Surrounding whitespace and double quotes are removed, including
    whitespace that sits inside the quotes. ``None``, empty values and the
    placeholders "null", "no data" and "--" (case-insensitive) are reported
    as "Not Available".

    Args:
        code: The raw field value; non-string values are converted with str().

    Returns:
        The cleaned text, or "Not Available" when the field holds no data.
    """
    if code is None:
        return "Not Available"
    # Strip again after removing the quotes so '" Acme "' and 'Acme' compare
    # equal and '" null "' is still recognised as a placeholder.
    cleaned_code = str(code).strip().strip('"').strip()
    if not cleaned_code or cleaned_code.lower() in {"null", "no data", "--"}:
        return "Not Available"
    return cleaned_code

In [63]:
def clean_digit(following_value):
    """Convert a field to a non-negative integer, defaulting to 0.

    Args:
        following_value: The value to convert. Non-string values are
            converted with str() first, so an int passes through unchanged
            and None yields 0 instead of raising AttributeError.

    Returns:
        The integer value when the text (ignoring surrounding whitespace)
        consists only of digits, otherwise 0.
    """
    text = str(following_value).strip()
    if not text.isdigit():
        return 0
    try:
        return int(text)
    except ValueError:
        # str.isdigit() accepts some characters (e.g. superscript digits)
        # that int() cannot parse.
        return 0

In [None]:

# Position of the country code column in each split row.
country_code_index = header_columns.index("country_code")

# Clean every row's country code, drop duplicates, and order the result
# case-insensitively before collecting it on the driver.
country_codes_rdd = rdd_split.map(
    lambda fields: clean_column(fields[country_code_index])
)
unique_country_codes_rdd = country_codes_rdd.distinct()
distinct_countries = unique_country_codes_rdd.sortBy(
    lambda code: code.lower()
).collect()

print("Distinct Country Codes:")
for country in distinct_countries:
    print(country)

In [None]:
# Position of the region column in each split row.
region_index = header_columns.index("region")

# Pair every cleaned region with 1, then add the ones up per region.
regions_rdd = rdd_split.map(
    lambda fields: (clean_column(fields[region_index]), 1)
)
region_counts = regions_rdd.reduceByKey(lambda left, right: left + right)

# Each region occurs once after the reduce, so sorting the (region, count)
# pairs orders the output by region.
ordered_region_counts = region_counts.sortBy(lambda pair: pair)
region_counts_result = ordered_region_counts.collect()

print("Region:" )
for region, count in region_counts_result:
    print(f"{region}, Count: {count}")


In [None]:
# Position of the current company name column in each split row.
company_name_index = header_columns.index("current_company:name")

# Clean every row's company name, drop duplicates, and order the result
# case-insensitively. Despite its name, the final variable holds the
# collected Python list, not an RDD.
company_names_rdd = rdd_split.map(
    lambda fields: clean_column(fields[company_name_index])
)
unique_company_names = company_names_rdd.distinct()
distinct_company_names_rdd = unique_company_names.sortBy(
    lambda company_name: company_name.lower()
).collect()

print("Distinct company names:")
for company in distinct_company_names_rdd:
    print(company)

In [None]:
# Positions of the two columns used for the ranking.
following_index = header_columns.index("following")
name_index = header_columns.index("name")

# Build (following count, name) pairs; values that are not plain digits
# count as 0.
# NOTE(review): the column read here is "following" while the output labels
# the number as "followers" - confirm this is the intended column.
people_rdd = rdd_split.map(
    lambda fields: (
        clean_digit(clean_column(fields[following_index])),
        clean_column(fields[name_index]),
    )
)

# Largest counts first, then keep the first ten entries.
sorted_people_rdd = people_rdd.sortBy(lambda person: person[0], ascending=False)
top_10_followed = sorted_people_rdd.take(10)

print("Top 10 Most-Followed People:")
for following, name in top_10_followed:
    print(f"{name}: {following} followers")
