In [2]:
from pyspark.sql import SparkSession
# Build (or fetch the already-running) Spark session for this notebook,
# using the "local[*]" master and the application name "example_app".
spark = (
    SparkSession.builder
    .appName("example_app")
    .master("local[*]")
    .getOrCreate()
)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/20 13:18:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType
from datetime import datetime

# getOrCreate() hands back the running session if one exists, otherwise it
# starts a new one named "CountryRankingChange".
spark = SparkSession.builder.appName("CountryRankingChange").getOrCreate()

# --- Schemas -----------------------------------------------------------------
# Every column is nullable (the third argument to add()).
user_schema = (
    StructType()
    .add("user_id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("status", StringType(), True)
    .add("country", StringType(), True)
)

comment_schema = (
    StructType()
    .add("user_id", IntegerType(), True)
    .add("created_dt", TimestampType(), True)
    .add("no_comments", IntegerType(), True)
)

# --- fb_actual_users: (user_id, name, status, country) -------------------------
user_data = [
    (1, "Alice", "active", "USA"),
    (2, "Bob", "active", "India"),
    (3, "Charlie", "inactive", "USA"),
    (4, "David", "active", "UK"),
    (5, "Eve", "active", "India"),
    (6, "Frank", "active", "UK"),
    (7, "Grace", "active", "Canada"),
]

fb_actual_users = spark.createDataFrame(user_data, user_schema)

# --- fb_comment_counts: (user_id, created_dt, no_comments) ---------------------
comment_data = [
    # December 2021
    (1, datetime(2021, 12, 5, 10), 5),
    (2, datetime(2021, 12, 10, 12), 8),
    (3, datetime(2021, 12, 15, 14), 2),
    (4, datetime(2021, 12, 20, 16), 3),
    (7, datetime(2021, 12, 15, 13), 4),
    # January 2022
    (5, datetime(2022, 1, 5, 11), 7),
    (6, datetime(2022, 1, 10, 9), 7),
    (7, datetime(2022, 1, 15, 13), 6),
    (1, datetime(2022, 1, 20, 15), 3),
]

# Hand-computed comment totals per country, for checking the results:
#   Dec 2021: India 8, USA 7, Canada 4, UK 3
#   Jan 2022: India 7, UK 7, Canada 6, USA 3
#     rank       -> 1, 1, 3, 4   (gap after the tie)
#     dense_rank -> 1, 1, 2, 3   (no gap)

fb_comment_counts = spark.createDataFrame(comment_data, comment_schema)


In [29]:
# Which countries have risen in the rankings, based on the number of comments,
# between December 2021 and January 2022? (Avoid gaps between ranks, i.e. use dense ranking.)

from pyspark.sql.functions import col,sum, dense_rank
from pyspark.sql.window import Window

from datetime import datetime

# Register both frames so the SQL variant below can refer to them by name.
fb_actual_users.createOrReplaceTempView('users')
fb_comment_counts.createOrReplaceTempView('counts')

# --- SQL variant ---------------------------------------------------------------
# Built lazily and kept for reference; run `risen_sql.show()` to execute it.
# Each month is selected with a half-open range [first day, first day of the next
# month). `between date'...-01' and date'...-31'` compares the TIMESTAMP column
# against midnight of the last day, so comments made later on the 31st are lost.
risen_sql = spark.sql("""
    with deck_2021 as (
        select country,
               sum(no_comments) as sum1,
               dense_rank() over (order by sum(no_comments) desc) as rank
        from users u
        join counts c on u.user_id = c.user_id
        where c.created_dt >= timestamp'2021-12-01'
          and c.created_dt <  timestamp'2022-01-01'
        group by country
    ),
    deck_2022 as (
        select country,
               sum(no_comments) as sum1,
               dense_rank() over (order by sum(no_comments) desc) as rank
        from users u
        join counts c on u.user_id = c.user_id
        where c.created_dt >= timestamp'2022-01-01'
          and c.created_dt <  timestamp'2022-02-01'
        group by country
    )
    select d2.country, d2.sum1
    from deck_2021 d1
    join deck_2022 d2
      on d1.country = d2.country
     and d2.rank < d1.rank
""")

# --- DataFrame variant -----------------------------------------------------------
# Countries ordered by their total comment count, highest first. There is no
# partitionBy, which is why Spark logs "No Partition Defined for Window operation".
# NOTE: `sum` here is pyspark.sql.functions.sum (imported above), not the builtin.
window_spec = Window.orderBy(sum(col('no_comments')).desc())


def rank_countries_by_comments(start: datetime, end: datetime, rank_col: str):
    """Dense-rank countries by total comments made in the window [start, end).

    Args:
        start: Inclusive lower bound on ``created_dt``.
        end: Exclusive upper bound on ``created_dt``.
        rank_col: Name given to the resulting rank column.

    Returns:
        A DataFrame with the columns ``country`` and ``rank_col``; rank 1 is the
        country with the most comments, and tied countries share a rank.
    """
    return (
        # Inner join: the created_dt filter discards unmatched users anyway, so
        # the original left join only looked like it kept them.
        fb_actual_users.join(fb_comment_counts, 'user_id', how='inner')
        .filter((col('created_dt') >= start) & (col('created_dt') < end))
        .groupBy(col('country'))
        .agg(dense_rank().over(window_spec).alias(rank_col))
    )


deck_2021 = rank_countries_by_comments(datetime(2021, 12, 1), datetime(2022, 1, 1), "rank21")
deck_2022 = rank_countries_by_comments(datetime(2022, 1, 1), datetime(2022, 2, 1), "rank22")

deck_2021.show()
deck_2022.show()

# A country has risen when its January rank number is smaller than its December one.
deck_2022.join(deck_2021, "country", how='inner').filter("rank22 < rank21").select("country").show()


25/05/20 14:55:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/20 14:55:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/20 14:55:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/20 14:55:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/20 14:55:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/20 14:55:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/20 1

+-------+------+
|country|rank21|
+-------+------+
|  India|     1|
|    USA|     2|
| Canada|     3|
|     UK|     4|
+-------+------+



25/05/20 14:55:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/20 14:55:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/20 14:55:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/20 14:55:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/20 14:55:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/20 14:55:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/20 1

+-------+------+
|country|rank22|
+-------+------+
|  India|     1|
|     UK|     1|
| Canada|     2|
|    USA|     3|
+-------+------+



25/05/20 14:55:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/20 14:55:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/20 14:55:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/20 14:55:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/20 14:55:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/20 14:55:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/20 1

+-------+
|country|
+-------+
| Canada|
|     UK|
+-------+



In [None]:
You are given two strings word1 and word2. Merge the strings by adding letters in alternating order, starting with word1. 
If a string is longer than the other, append the additional letters onto the end of the merged string.

Return the merged string.

 

Example 1:

Input: word1 = "abc", word2 = "pqr"
Output: "apbqcr"
Explanation: The merged string will be merged as so:
word1:  a   b   c
word2:    p   q   r
merged: a p b q c r
Example 2:

Input: word1 = "ab", word2 = "pqrs"
Output: "apbqrs"
Explanation: Notice that as word2 is longer, "rs" is appended to the end.
word1:  a   b 
word2:    p   q   r   s
merged: a p b q   r   s
Example 3:

Input: word1 = "abcd", word2 = "pq"
Output: "apbqcd"
Explanation: Notice that as word1 is longer, "cd" is appended to the end.
word1:  a   b   c   d
word2:    p   q 
merged: a p b q c   d

In [None]:
def merge_alternately(word1: str, word2: str) -> str:
    """Interleave two strings character by character, starting with ``word1``.

    Once the shorter string is exhausted, the rest of the longer one is
    appended unchanged.

    Args:
        word1: Supplies the 1st, 3rd, 5th, ... characters of the result.
        word2: Supplies the 2nd, 4th, 6th, ... characters of the result.

    Returns:
        The merged string; ``""`` when both inputs are empty.
    """
    shared = min(len(word1), len(word2))
    # zip() stops at the shorter input, so this covers exactly the first
    # `shared` characters of each word.
    interleaved = ''.join(a + b for a, b in zip(word1, word2))
    # At most one of these two tails is non-empty.
    return interleaved + word1[shared:] + word2[shared:]


str1 = 'abcqwertyui'
str2 = 'pqrs'

# The inputs are left intact (the previous loop consumed them down to '').
res = merge_alternately(str1, str2)
res

'apbqcrqswertyui'

<h1>PySpark interview questions</h1>

In [59]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Session for the sales example; getOrCreate() reuses a running session if present.
spark = (
    SparkSession.builder
    .appName("Create Sales DataFrame")
    .getOrCreate()
)

# Column layout: region, product, revenue (all nullable).
schema = (
    StructType()
    .add("region", StringType(), True)
    .add("product", StringType(), True)
    .add("revenue", IntegerType(), True)
)

# One row per (region, product) pair.
data = [
    ("East", "A", 100),
    ("East", "B", 150),
    ("East", "C", 120),
    ("West", "A", 200),
    ("West", "B", 180),
    ("West", "C", 210),
]

# Materialise the rows as a DataFrame and print it.
df = spark.createDataFrame(data, schema)
df.show()


25/05/20 16:46:22 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


[Stage 349:>                                                        (0 + 3) / 3]

+------+-------+-------+
|region|product|revenue|
+------+-------+-------+
|  East|      A|    100|
|  East|      B|    150|
|  East|      C|    120|
|  West|      A|    200|
|  West|      B|    180|
|  West|      C|    210|
+------+-------+-------+



                                                                                

In [None]:
# Scenario:
# Find employees whose salary falls within the top 20% highest salaries in their
# respective departments.
#
# Rows listed with the scenario:
# | emp_id | name   | department | salary |
# |--------|--------|------------|--------|
# | 4      | David  | IT         | 4500   |
# | 5      | Eva    | Finance    | 5000   |

from pyspark.sql.functions import col, percent_rank
from pyspark.sql.window import Window

# Fraction of each department (counted from the highest salary) to keep.
TOP_FRACTION = 0.2

# Illustrative data: the scenario only lists emp_id 4 and 5, so the other rows
# are made-up fillers that let each department have more than one salary.
# NOTE(review): replace with the real employee table when it is available.
employee_data = [
    (1, "Alice", "IT", 3000),
    (2, "Bob", "IT", 3500),
    (3, "Carol", "Finance", 4000),
    (4, "David", "IT", 4500),
    (5, "Eva", "Finance", 5000),
    (6, "Frank", "Finance", 4200),
]
employees_df = spark.createDataFrame(
    employee_data,
    schema="emp_id INT, name STRING, department STRING, salary INT",
)

# Rank inside each department from the highest salary down. A WindowSpec is
# built with partitionBy()/orderBy(); `.over()` belongs to the window *function*
# (e.g. percent_rank().over(window_spec)), not to the spec.
window_spec = Window.partitionBy(col('department')).orderBy(col('salary').desc())

# percent_rank() is (rank - 1) / (rows_in_partition - 1): 0.0 for the highest
# salary, 1.0 for the lowest, and 0.0 for a department with a single employee.
# "Top 20%" is therefore read as percent_rank <= 0.2, which always keeps each
# department's top earner (and anyone tied with them).
top_earners_df = (
    employees_df
    .withColumn('pct_rank', percent_rank().over(window_spec))
    .filter(col('pct_rank') <= TOP_FRACTION)
)

top_earners_df.show()


git config --global user.name "hari"

git config --global user.email "kishorehari228@gmail.com"